From a3c8ab4fe8f006d742c24be677518bfa9862e732 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 30 Nov 2005 09:55:22 -0800 Subject: IB/mthca: fix QP size limits for mem-free HCAs Unlike tavor, the max work queue size is an exact power of 2 for arbel mode, despite what the documentation (of the QUERY_DEV_LIM firmware command) says. Without this patch, on Arbel, we can start with a QP of a valid size and get above the reported limit after rounding to the next power of two. Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_cmd.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 9ed34587fc..22ac72bc20 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -937,10 +937,6 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, if (err) goto out; - MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET); - dev_lim->max_srq_sz = (1 << field) - 1; - MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET); - dev_lim->max_qp_sz = (1 << field) - 1; MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_QP_OFFSET); dev_lim->reserved_qps = 1 << (field & 0xf); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_OFFSET); @@ -1056,6 +1052,10 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, mthca_dbg(dev, "Flags: %08x\n", dev_lim->flags); if (mthca_is_memfree(dev)) { + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET); + dev_lim->max_srq_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET); + dev_lim->max_qp_sz = 1 << field; MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSZ_SRQ_OFFSET); dev_lim->hca.arbel.resize_srq = field & 1; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_RQ_OFFSET); @@ -1087,6 +1087,10 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, mthca_dbg(dev, "Max ICM size %lld MB\n", (unsigned long long) dev_lim->hca.arbel.max_icm_sz >> 20); } else { + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET); + dev_lim->max_srq_sz = (1 << field) - 1; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET); + dev_lim->max_qp_sz = (1 << field) - 1; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_AV_OFFSET); dev_lim->hca.tavor.max_avs = 1 << (field & 0x3f); dev_lim->mpt_entry_sz = MTHCA_MPT_ENTRY_SIZE; -- cgit v1.2.2 From 227eca83690da7dcbd698d3268e29402e0571723 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 30 Nov 2005 10:00:25 -0800 Subject: IB/cm: correct reported reject code Change reject code from TIMEOUT to CONSUMER_REJECT when destroying a cm_id in the process of connecting. 
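For context, the two reject reasons give the remote consumer different advice: IB_CM_REJ_TIMEOUT roughly says "the peer never answered, retrying may help", while IB_CM_REJ_CONSUMER_DEFINED says "the peer gave up deliberately". A minimal sketch of how a passive-side CM event handler might act on that difference is below; the handler itself is hypothetical, and only the event type and reject-reason constants are taken from the IB CM API.

static int example_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
{
	if (event->event == IB_CM_REJ_RECEIVED) {
		switch (event->param.rej_rcvd.reason) {
		case IB_CM_REJ_TIMEOUT:
			/* peer (or fabric) never responded; a later retry may succeed */
			break;
		case IB_CM_REJ_CONSUMER_DEFINED:
			/* peer tore the connection down on purpose; don't spin retrying */
			break;
		default:
			break;
		}
	}
	return 0;
}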
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cm.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 02110e00d1..1fe21865d1 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -684,6 +684,13 @@ retest: cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT); break; case IB_CM_REQ_SENT: + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT, + &cm_id_priv->av.port->cm_dev->ca_guid, + sizeof cm_id_priv->av.port->cm_dev->ca_guid, + NULL, 0); + break; case IB_CM_MRA_REQ_RCVD: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: @@ -694,10 +701,8 @@ retest: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT, - &cm_id_priv->av.port->cm_dev->ca_guid, - sizeof cm_id_priv->av.port->cm_dev->ca_guid, - NULL, 0); + ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); break; case IB_CM_ESTABLISHED: spin_unlock_irqrestore(&cm_id_priv->lock, flags); -- cgit v1.2.2 From de1bb1a64c29bae4f5330c70bd1dc6a62954c9f4 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 30 Nov 2005 10:01:13 -0800 Subject: IB/cm: avoid reusing local ID Use an increasing local ID to avoid re-using identifiers while messages may still be outstanding on the old ID. Without this, a quick connect-disconnect-connect sequence can fail by matching messages for the new connection with the old connection. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 1fe21865d1..3a611fe549 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -308,10 +308,11 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv) { unsigned long flags; int ret; + static int next_id; do { spin_lock_irqsave(&cm.lock, flags); - ret = idr_get_new_above(&cm.local_id_table, cm_id_priv, 1, + ret = idr_get_new_above(&cm.local_id_table, cm_id_priv, next_id++, (__force int *) &cm_id_priv->id.local_id); spin_unlock_irqrestore(&cm.lock, flags); } while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) ); -- cgit v1.2.2 From 0efc4883a6b3de12476cd7a35e638c0a9f5fd75f Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 9 Dec 2005 13:46:32 -0800 Subject: IB/umad: fix memory leaks Don't leak packet if it had a timeout, and don't leak timeout struct if queue_packet() fails. Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/core/user_mad.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index eb7f52537c..c908de8db5 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -197,8 +197,8 @@ static void send_handler(struct ib_mad_agent *agent, memcpy(timeout->mad.data, packet->mad.data, sizeof (struct ib_mad_hdr)); - if (!queue_packet(file, agent, timeout)) - return; + if (queue_packet(file, agent, timeout)) + kfree(timeout); } out: kfree(packet); -- cgit v1.2.2 From 52d0df153c987e4ad57d15f5df91848f65858e5d Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 9 Dec 2005 13:48:50 -0800 Subject: IB/mthca: fix memory user DB table leak Free the memory allocated in mthca_init_user_db_tab() when releasing the db_tab in mthca_cleanup_user_db_tab(). Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_memfree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index d72fe95cba..5798ed00d8 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -485,6 +485,8 @@ void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar, put_page(db_tab->page[i].mem.page); } } + + kfree(db_tab); } int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type, -- cgit v1.2.2 From 94361cf74a6fca1973d2fed5338d5fb4bcd902fa Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 9 Dec 2005 16:32:21 -0800 Subject: IB/mthca: check RDMA limits Add limit checking on rd_atomic and dest_rd_atomic attributes: especially for max_dest_rd_atomic, a value that is larger than HCA capability can cause RDB overflow and corruption of another QP. Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 7450550db7..c5c3d0edbb 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -591,6 +591,20 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) return -EINVAL; } + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > dev->limits.max_qp_init_rdma) { + mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n", + attr->max_rd_atomic, dev->limits.max_qp_init_rdma); + return -EINVAL; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) { + mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n", + attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift); + return -EINVAL; + } + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); -- cgit v1.2.2 From 6aa2e4e8063114bd7cea8616dd5848d3c64b4c36 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 9 Dec 2005 16:38:04 -0800 Subject: IB/mthca: correct log2 calculation Fix thinko in rd_atomic calculation: ffs(x) - 1 does not find the next power of 2 -- it should be fls(x - 1). Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index c5c3d0edbb..84056a8b79 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -728,9 +728,9 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { - qp_context->params1 |= cpu_to_be32(min(attr->max_rd_atomic ? - ffs(attr->max_rd_atomic) - 1 : 0, - 7) << 21); + if (attr->max_rd_atomic) + qp_context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_SRA_MAX); } @@ -769,8 +769,6 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { - u8 rra_max; - if (qp->resp_depth && !attr->max_dest_rd_atomic) { /* * Lowering our responder resources to zero. @@ -798,13 +796,10 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) MTHCA_QP_OPTPAR_RAE); } - for (rra_max = 0; - 1 << rra_max < attr->max_dest_rd_atomic && - rra_max < dev->qp_table.rdb_shift; - ++rra_max) - ; /* nothing */ + if (attr->max_dest_rd_atomic) + qp_context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); - qp_context->params2 |= cpu_to_be32(rra_max << 21); qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRA_MAX); qp->resp_depth = attr->max_dest_rd_atomic; -- cgit v1.2.2 From 44b5b0303327cfb23f135b95b2fe5436c81ed27c Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 9 Dec 2005 16:40:14 -0800 Subject: IB/mthca: don't change driver's copy of attributes if modify QP fails Only change the driver's copy of the QP attributes in modify QP after checking the modify QP command completed successfully. Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 84056a8b79..3543299ecb 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -764,8 +764,6 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE | MTHCA_QP_OPTPAR_RRE | MTHCA_QP_OPTPAR_RAE); - - qp->atomic_rd_en = attr->qp_access_flags; } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { @@ -801,8 +799,6 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRA_MAX); - - qp->resp_depth = attr->max_dest_rd_atomic; } qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC); @@ -844,8 +840,13 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) err = -EINVAL; } - if (!err) + if (!err) { qp->state = new_state; + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + } mthca_free_mailbox(dev, mailbox); -- cgit v1.2.2 From 6c7d2a75b512c64c910b69adf32dbaddb461910b Mon Sep 17 00:00:00 2001 From: "Michael S. 
Tsirkin" Date: Thu, 15 Dec 2005 13:55:50 -0800 Subject: IB/mthca: Fix thinko in mthca_table_find() break only escapes from the innermost loop, and we want to escape both loops and return an answer. Noticed by Ishai Rabinovitch. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_memfree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index 5798ed00d8..9fb985a016 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -233,7 +233,7 @@ void *mthca_table_find(struct mthca_icm_table *table, int obj) for (i = 0; i < chunk->npages; ++i) { if (chunk->mem[i].length >= offset) { page = chunk->mem[i].page; - break; + goto out; } offset -= chunk->mem[i].length; } -- cgit v1.2.2 From 576d2e4e40315e8140c04be99cd057720d8a3817 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 15 Dec 2005 14:20:23 -0800 Subject: IB/mthca: Fix SRQ cleanup during QP destroy When cleaning up a CQ for a QP attached to SRQ, need to free an SRQ WQE only if the CQE is a receive completion. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_cq.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c index 4a8adcef20..fcef8dc2c1 100644 --- a/drivers/infiniband/hw/mthca/mthca_cq.c +++ b/drivers/infiniband/hw/mthca/mthca_cq.c @@ -253,6 +253,15 @@ void mthca_cq_event(struct mthca_dev *dev, u32 cqn, wake_up(&cq->wait); } +static inline int is_recv_cqe(struct mthca_cqe *cqe) +{ + if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) == + MTHCA_ERROR_CQE_OPCODE_MASK) + return !(cqe->opcode & 0x01); + else + return !(cqe->is_send & 0x80); +} + void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn, struct mthca_srq *srq) { @@ -296,7 +305,7 @@ void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn, while ((int) --prod_index - (int) cq->cons_index >= 0) { cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); if (cqe->my_qpn == cpu_to_be32(qpn)) { - if (srq) + if (srq && is_recv_cqe(cqe)) mthca_free_srq_wqe(srq, be32_to_cpu(cqe->wqe)); ++nfreed; } else if (nfreed) -- cgit v1.2.2 From d1646f86a2a05a956adbb163c81a81bd621f055e Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 15 Dec 2005 14:36:24 -0800 Subject: IB/mthca: Fix IB_QP_ACCESS_FLAGS handling. This patch corrects some corner cases in managing the RAE/RRE bits in the mthca qp context. These bits need to be zero if the user requests max_dest_rd_atomic of zero. The bits need to be restored to the value implied by the qp access flags attribute in a previous (or the current) modify-qp command if the dest_rd_atomic variable is changed to non-zero. In the current implementation, the following scenario will not work: RESET-to-INIT set QP access flags to all disabled (zeroes) INIT-to-RTR set max_dest_rd_atomic=10, AND set qp_access_flags = IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_ATOMIC The current code will incorrectly take the access-flags value set in the RESET-to-INIT transition. We can simplify, and correct, this IB_QP_ACCESS_FLAGS handling: it is always safe to set qp access flags in the firmware command if either of IB_QP_MAX_DEST_RD_ATOMIC or IB_QP_ACCESS_FLAGS is set, so let's just set it to the correct value, always. Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 87 +++++++++++++++------------------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 3543299ecb..e826c9ff5d 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -522,6 +522,36 @@ static void init_port(struct mthca_dev *dev, int port) mthca_warn(dev, "INIT_IB returned status %02x.\n", status); } +static __be32 get_hw_access_flags(struct mthca_qp *qp, struct ib_qp_attr *attr, + int attr_mask) +{ + u8 dest_rd_atomic; + u32 access_flags; + u32 hw_access_flags = 0; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MTHCA_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= MTHCA_QP_BIT_RAE; + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MTHCA_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) { struct mthca_dev *dev = to_mdev(ibqp->device); @@ -743,57 +773,7 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) qp_context->snd_db_index = cpu_to_be32(qp->sq.db_index); } - if (attr_mask & IB_QP_ACCESS_FLAGS) { - qp_context->params2 |= - cpu_to_be32(attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE ? - MTHCA_QP_BIT_RWE : 0); - - /* - * Only enable RDMA reads and atomics if we have - * responder resources set to a non-zero value. - */ - if (qp->resp_depth) { - qp_context->params2 |= - cpu_to_be32(attr->qp_access_flags & IB_ACCESS_REMOTE_READ ? - MTHCA_QP_BIT_RRE : 0); - qp_context->params2 |= - cpu_to_be32(attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC ? - MTHCA_QP_BIT_RAE : 0); - } - - qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE | - MTHCA_QP_OPTPAR_RRE | - MTHCA_QP_OPTPAR_RAE); - } - if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { - if (qp->resp_depth && !attr->max_dest_rd_atomic) { - /* - * Lowering our responder resources to zero. - * Turn off reads RDMA and atomics as responder. - * (RRE/RAE in params2 already zero) - */ - qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRE | - MTHCA_QP_OPTPAR_RAE); - } - - if (!qp->resp_depth && attr->max_dest_rd_atomic) { - /* - * Increasing our responder resources from - * zero. Turn on RDMA reads and atomics as - * appropriate. - */ - qp_context->params2 |= - cpu_to_be32(qp->atomic_rd_en & IB_ACCESS_REMOTE_READ ? - MTHCA_QP_BIT_RRE : 0); - qp_context->params2 |= - cpu_to_be32(qp->atomic_rd_en & IB_ACCESS_REMOTE_ATOMIC ? 
- MTHCA_QP_BIT_RAE : 0); - - qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRE | - MTHCA_QP_OPTPAR_RAE); - } - if (attr->max_dest_rd_atomic) qp_context->params2 |= cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); @@ -801,6 +781,13 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRA_MAX); } + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { + qp_context->params2 |= get_hw_access_flags(qp, attr, attr_mask); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE | + MTHCA_QP_OPTPAR_RRE | + MTHCA_QP_OPTPAR_RAE); + } + qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC); if (ibqp->srq) -- cgit v1.2.2 From c4342d8a4d95e18b957b898dbf5bfce28fca2780 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 15 Dec 2005 19:59:01 -0800 Subject: IB/mthca: Fix corner cases in max_rd_atomic value handling in modify QP sae and sre bits should only be set when setting sra_max. Further, in the old code, if the caller specifies max_rd_atomic = 0, the sre and sae bits are still set, with the result that the QP ends up with max_rd_atomic = 1 in effect. Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index e826c9ff5d..d786ef4436 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -747,9 +747,7 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) qp_context->wqe_lkey = cpu_to_be32(qp->mr.ibmr.lkey); qp_context->params1 = cpu_to_be32((MTHCA_ACK_REQ_FREQ << 28) | (MTHCA_FLIGHT_LIMIT << 24) | - MTHCA_QP_BIT_SRE | - MTHCA_QP_BIT_SWE | - MTHCA_QP_BIT_SAE); + MTHCA_QP_BIT_SWE); if (qp->sq_policy == IB_SIGNAL_ALL_WR) qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SSC); if (attr_mask & IB_QP_RETRY_CNT) { @@ -758,9 +756,13 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { - if (attr->max_rd_atomic) + if (attr->max_rd_atomic) { + qp_context->params1 |= + cpu_to_be32(MTHCA_QP_BIT_SRE | + MTHCA_QP_BIT_SAE); qp_context->params1 |= cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + } qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_SRA_MAX); } -- cgit v1.2.2 From 1d7d2f6f476cf7aa65f9f740a6c932fb75608110 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 4 Jan 2006 14:42:39 -0800 Subject: IB/mthca: fix WQE size calculation in create-srq Thinko: 64 bytes is the minimum SRQ WQE size (not the maximum). 
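A quick worked example shows the effect of the thinko; the segment sizes below are illustrative assumptions rather than values read from the driver headers, and the snippet is a standalone sketch, not driver code.

#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long x)
{
	unsigned long r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

int main(void)
{
	/* assume a 16-byte next segment plus 8 scatter entries of 16 bytes each */
	unsigned long need = 16 + 8 * 16;		/* 144 bytes actually required */
	unsigned long ds = roundup_pow_of_two(need);	/* 256-byte power-of-2 stride */
	unsigned long buggy = ds < 64 ? ds : 64;	/* min(64UL, ds): clamped down to 64 */
	unsigned long fixed = ds > 64 ? ds : 64;	/* max(64UL, ds): 64 is only a floor */

	printf("need %lu, min() picks %lu (too small), max() picks %lu\n",
	       need, buggy, fixed);
	return 0;
}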
Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_srq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index f7d234295e..e7e153d9c4 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -201,7 +201,7 @@ int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd, if (mthca_is_memfree(dev)) srq->max = roundup_pow_of_two(srq->max + 1); - ds = min(64UL, + ds = max(64UL, roundup_pow_of_two(sizeof (struct mthca_next_seg) + srq->max_gs * sizeof (struct mthca_data_seg))); srq->wqe_shift = long_log2(ds); -- cgit v1.2.2 From aa2f9367790ad81ef51d3f667124227ca3003d3b Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 5 Jan 2006 16:12:01 -0800 Subject: IB/mthca: check return value in mthca_dev_lim call Check error return on call to mthca_dev_lim for Tavor (as is done for memfree). Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 6f94b25f3a..8b00d9a0f6 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -261,6 +261,10 @@ static int __devinit mthca_init_tavor(struct mthca_dev *mdev) } err = mthca_dev_lim(mdev, &dev_lim); + if (err) { + mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n"); + goto err_disable; + } profile = default_profile; profile.num_uar = dev_lim.uar_size / PAGE_SIZE; -- cgit v1.2.2 From 38d1e793471d95728219f500bbb8bd25658d73b0 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 5 Jan 2006 16:13:46 -0800 Subject: IB/mthca: check port validity in modify_qp Modify_qp should check that the physical port number provided is a legal value. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index d786ef4436..ea45fa400f 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -621,6 +621,12 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) return -EINVAL; } + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) { + mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num); + return -EINVAL; + } + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic > dev->limits.max_qp_init_rdma) { mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n", -- cgit v1.2.2 From 466200562ccd80f728f7ef602d2b97b4fdedd566 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 5 Jan 2006 16:17:38 -0800 Subject: IB/mthca: create_eq with size not a power of 2 Fix mthca_create_eq for when the EQ size is not a power of 2. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_eq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c index 34d68e5a72..e8a948f087 100644 --- a/drivers/infiniband/hw/mthca/mthca_eq.c +++ b/drivers/infiniband/hw/mthca/mthca_eq.c @@ -484,8 +484,7 @@ static int __devinit mthca_create_eq(struct mthca_dev *dev, u8 intr, struct mthca_eq *eq) { - int npages = (nent * MTHCA_EQ_ENTRY_SIZE + PAGE_SIZE - 1) / - PAGE_SIZE; + int npages; u64 *dma_list = NULL; dma_addr_t t; struct mthca_mailbox *mailbox; @@ -496,6 +495,7 @@ static int __devinit mthca_create_eq(struct mthca_dev *dev, eq->dev = dev; eq->nent = roundup_pow_of_two(max(nent, 2)); + npages = ALIGN(eq->nent * MTHCA_EQ_ENTRY_SIZE, PAGE_SIZE) / PAGE_SIZE; eq->page_list = kmalloc(npages * sizeof *eq->page_list, GFP_KERNEL); -- cgit v1.2.2 From 5b3bc7a68171138d52b1b62012c37ac888895460 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 6 Jan 2006 12:57:30 -0800 Subject: IB/mthca: max_inline_data handling tweaks Fix a case where copying max_inline_data from a successful create_qp capabilities output to create_qp input could cause EINVAL error: mthca_set_qp_size must check max_inline_data directly against max_desc_sz; checking qp->sq.max_gs is wrong since max_inline_data depends on the qp type and does not involve max_sg. Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 62 ++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index ea45fa400f..fd60cf3a5b 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -890,18 +890,13 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) return err; } -static void mthca_adjust_qp_caps(struct mthca_dev *dev, - struct mthca_pd *pd, - struct mthca_qp *qp) +static int mthca_max_data_size(struct mthca_dev *dev, struct mthca_qp *qp, int desc_sz) { - int max_data_size; - /* * Calculate the maximum size of WQE s/g segments, excluding * the next segment and other non-data segments. */ - max_data_size = min(dev->limits.max_desc_sz, 1 << qp->sq.wqe_shift) - - sizeof (struct mthca_next_seg); + int max_data_size = desc_sz - sizeof (struct mthca_next_seg); switch (qp->transport) { case MLX: @@ -920,11 +915,24 @@ static void mthca_adjust_qp_caps(struct mthca_dev *dev, break; } + return max_data_size; +} + +static inline int mthca_max_inline_data(struct mthca_pd *pd, int max_data_size) +{ /* We don't support inline data for kernel QPs (yet). */ - if (!pd->ibpd.uobject) - qp->max_inline_data = 0; - else - qp->max_inline_data = max_data_size - MTHCA_INLINE_HEADER_SIZE; + return pd->ibpd.uobject ? 
max_data_size - MTHCA_INLINE_HEADER_SIZE : 0; +} + +static void mthca_adjust_qp_caps(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_qp *qp) +{ + int max_data_size = mthca_max_data_size(dev, qp, + min(dev->limits.max_desc_sz, + 1 << qp->sq.wqe_shift)); + + qp->max_inline_data = mthca_max_inline_data(pd, max_data_size); qp->sq.max_gs = min_t(int, dev->limits.max_sg, max_data_size / sizeof (struct mthca_data_seg)); @@ -1191,13 +1199,23 @@ static int mthca_alloc_qp_common(struct mthca_dev *dev, } static int mthca_set_qp_size(struct mthca_dev *dev, struct ib_qp_cap *cap, - struct mthca_qp *qp) + struct mthca_pd *pd, struct mthca_qp *qp) { + int max_data_size = mthca_max_data_size(dev, qp, dev->limits.max_desc_sz); + /* Sanity check QP size before proceeding */ - if (cap->max_send_wr > dev->limits.max_wqes || - cap->max_recv_wr > dev->limits.max_wqes || - cap->max_send_sge > dev->limits.max_sg || - cap->max_recv_sge > dev->limits.max_sg) + if (cap->max_send_wr > dev->limits.max_wqes || + cap->max_recv_wr > dev->limits.max_wqes || + cap->max_send_sge > dev->limits.max_sg || + cap->max_recv_sge > dev->limits.max_sg || + cap->max_inline_data > mthca_max_inline_data(pd, max_data_size)) + return -EINVAL; + + /* + * For MLX transport we need 2 extra S/G entries: + * one for the header and one for the checksum at the end + */ + if (qp->transport == MLX && cap->max_recv_sge + 2 > dev->limits.max_sg) return -EINVAL; if (mthca_is_memfree(dev)) { @@ -1216,14 +1234,6 @@ static int mthca_set_qp_size(struct mthca_dev *dev, struct ib_qp_cap *cap, MTHCA_INLINE_CHUNK_SIZE) / sizeof (struct mthca_data_seg)); - /* - * For MLX transport we need 2 extra S/G entries: - * one for the header and one for the checksum at the end - */ - if ((qp->transport == MLX && qp->sq.max_gs + 2 > dev->limits.max_sg) || - qp->sq.max_gs > dev->limits.max_sg || qp->rq.max_gs > dev->limits.max_sg) - return -EINVAL; - return 0; } @@ -1238,7 +1248,7 @@ int mthca_alloc_qp(struct mthca_dev *dev, { int err; - err = mthca_set_qp_size(dev, cap, qp); + err = mthca_set_qp_size(dev, cap, pd, qp); if (err) return err; @@ -1281,7 +1291,7 @@ int mthca_alloc_sqp(struct mthca_dev *dev, u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1; int err; - err = mthca_set_qp_size(dev, cap, &sqp->qp); + err = mthca_set_qp_size(dev, cap, pd, &sqp->qp); if (err) return err; -- cgit v1.2.2 From 0364ffc3e8c441d4185e3eb41ecc61dbb09614e4 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 6 Jan 2006 13:01:27 -0800 Subject: IB/mthca: fix for SQEr-to-RTS transition in modify QP Fixes to SQEr->RTS transition in modify_qp: 1. The flag IB_QP_ACCESS_FLAGS is optional for UC qps 2. 
The SQEr state is not supported for RC qps Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index fd60cf3a5b..623f5144ea 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -476,9 +476,8 @@ static const struct { .opt_param = { [UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), - [UC] = IB_QP_CUR_STATE, - [RC] = (IB_QP_CUR_STATE | - IB_QP_MIN_RNR_TIMER), + [UC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS), [MLX] = (IB_QP_CUR_STATE | IB_QP_QKEY), } -- cgit v1.2.2 From 0d3b525fff40475e58dab9176740d2efc5f37838 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 6 Jan 2006 13:03:43 -0800 Subject: IB/mthca: fix for RTR-to-RTS transition in modify QP PKEY_INDEX is not a legal parameter in the RTR->RTS transition. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 623f5144ea..ff2def3e9d 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -383,12 +383,10 @@ static const struct { [UC] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX | IB_QP_PATH_MIG_STATE), [RC] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [MLX] = (IB_QP_CUR_STATE | -- cgit v1.2.2 From 5ceb74557c71465cf8f6fda050aac00e53f9ad3d Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 6 Jan 2006 13:11:07 -0800 Subject: IB/mthca: multiple fixes for multicast group handling Multicast group management fixes: . Fix leak of mailbox memory in error handling on multicast group operations. . Free AMGM indices at detach and in attach error handling. . Fix amount to shift for aligning next_gid_index in mailbox: it starts at bit 6, not bit 5. . Allocate AMGM index after end of MGM table, in the range num_mgms to multicast table size - 1. Add some BUG_ON checks to catch cases where the index falls in the MGM hash area. . Initialize the list of QPs in a newly-allocated group from AMGM to 0 This is necessary since when a group is moved from AMGM to MGM (in the case where the MGM entry has been emptied of QPs), the AMGM entry is not reset to 0 (and we don't want an extra command to do that). Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_mcg.c | 54 +++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_mcg.c b/drivers/infiniband/hw/mthca/mthca_mcg.c index 2fc449da41..77bc6c746f 100644 --- a/drivers/infiniband/hw/mthca/mthca_mcg.c +++ b/drivers/infiniband/hw/mthca/mthca_mcg.c @@ -111,7 +111,8 @@ static int find_mgm(struct mthca_dev *dev, goto out; if (status) { mthca_err(dev, "READ_MGM returned status %02x\n", status); - return -EINVAL; + err = -EINVAL; + goto out; } if (!memcmp(mgm->gid, zero_gid, 16)) { @@ -126,7 +127,7 @@ static int find_mgm(struct mthca_dev *dev, goto out; *prev = *index; - *index = be32_to_cpu(mgm->next_gid_index) >> 5; + *index = be32_to_cpu(mgm->next_gid_index) >> 6; } while (*index); *index = -1; @@ -153,8 +154,10 @@ int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) return PTR_ERR(mailbox); mgm = mailbox->buf; - if (down_interruptible(&dev->mcg_table.sem)) - return -EINTR; + if (down_interruptible(&dev->mcg_table.sem)) { + err = -EINTR; + goto err_sem; + } err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index); if (err) @@ -181,9 +184,8 @@ int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) err = -EINVAL; goto out; } - + memset(mgm, 0, sizeof *mgm); memcpy(mgm->gid, gid->raw, 16); - mgm->next_gid_index = 0; } for (i = 0; i < MTHCA_QP_PER_MGM; ++i) @@ -209,6 +211,7 @@ int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) if (status) { mthca_err(dev, "WRITE_MGM returned status %02x\n", status); err = -EINVAL; + goto out; } if (!link) @@ -223,7 +226,7 @@ int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) goto out; } - mgm->next_gid_index = cpu_to_be32(index << 5); + mgm->next_gid_index = cpu_to_be32(index << 6); err = mthca_WRITE_MGM(dev, prev, mailbox, &status); if (err) @@ -234,7 +237,12 @@ int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) } out: + if (err && link && index != -1) { + BUG_ON(index < dev->limits.num_mgms); + mthca_free(&dev->mcg_table.alloc, index); + } up(&dev->mcg_table.sem); + err_sem: mthca_free_mailbox(dev, mailbox); return err; } @@ -255,8 +263,10 @@ int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) return PTR_ERR(mailbox); mgm = mailbox->buf; - if (down_interruptible(&dev->mcg_table.sem)) - return -EINTR; + if (down_interruptible(&dev->mcg_table.sem)) { + err = -EINTR; + goto err_sem; + } err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index); if (err) @@ -305,13 +315,11 @@ int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) if (i != 1) goto out; - goto out; - if (prev == -1) { /* Remove entry from MGM */ - if (be32_to_cpu(mgm->next_gid_index) >> 5) { - err = mthca_READ_MGM(dev, - be32_to_cpu(mgm->next_gid_index) >> 5, + int amgm_index_to_free = be32_to_cpu(mgm->next_gid_index) >> 6; + if (amgm_index_to_free) { + err = mthca_READ_MGM(dev, amgm_index_to_free, mailbox, &status); if (err) goto out; @@ -332,9 +340,13 @@ int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) err = -EINVAL; goto out; } + if (amgm_index_to_free) { + BUG_ON(amgm_index_to_free < dev->limits.num_mgms); + mthca_free(&dev->mcg_table.alloc, amgm_index_to_free); + } } else { /* Remove entry from AMGM */ - index = be32_to_cpu(mgm->next_gid_index) >> 5; + int curr_next_index = be32_to_cpu(mgm->next_gid_index) >> 6; err = mthca_READ_MGM(dev, prev, mailbox, &status); if 
(err) goto out; @@ -344,7 +356,7 @@ int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) goto out; } - mgm->next_gid_index = cpu_to_be32(index << 5); + mgm->next_gid_index = cpu_to_be32(curr_next_index << 6); err = mthca_WRITE_MGM(dev, prev, mailbox, &status); if (err) @@ -354,10 +366,13 @@ int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) err = -EINVAL; goto out; } + BUG_ON(index < dev->limits.num_mgms); + mthca_free(&dev->mcg_table.alloc, index); } out: up(&dev->mcg_table.sem); + err_sem: mthca_free_mailbox(dev, mailbox); return err; } @@ -365,11 +380,12 @@ int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) int __devinit mthca_init_mcg_table(struct mthca_dev *dev) { int err; + int table_size = dev->limits.num_mgms + dev->limits.num_amgms; err = mthca_alloc_init(&dev->mcg_table.alloc, - dev->limits.num_amgms, - dev->limits.num_amgms - 1, - 0); + table_size, + table_size - 1, + dev->limits.num_mgms); if (err) return err; -- cgit v1.2.2 From 0f8e8f9607d77ffc1f9820446dfcf781e96fdfd4 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 6 Jan 2006 13:13:32 -0800 Subject: IB/mthca: Fill in vendor_err field in completion with error Fill vendor_err field in completion with error. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_cq.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c index fcef8dc2c1..96f1a86bf0 100644 --- a/drivers/infiniband/hw/mthca/mthca_cq.c +++ b/drivers/infiniband/hw/mthca/mthca_cq.c @@ -128,12 +128,12 @@ struct mthca_err_cqe { __be32 my_qpn; u32 reserved1[3]; u8 syndrome; - u8 reserved2; + u8 vendor_err; __be16 db_cnt; - u32 reserved3; + u32 reserved2; __be32 wqe; u8 opcode; - u8 reserved4[2]; + u8 reserved3[2]; u8 owner; }; @@ -342,8 +342,8 @@ static int handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq, } /* - * For completions in error, only work request ID, status (and - * freed resource count for RD) have to be set. + * For completions in error, only work request ID, status, vendor error + * (and freed resource count for RD) have to be set. */ switch (cqe->syndrome) { case SYNDROME_LOCAL_LENGTH_ERR: @@ -405,6 +405,8 @@ static int handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq, break; } + entry->vendor_err = cqe->vendor_err; + /* * Mem-free HCAs always generate one CQE per WQE, even in the * error case, so we don't have to check the doorbell count, etc. -- cgit v1.2.2 From 4de144bf721e46e7ccc8fed45b20a640cc364904 Mon Sep 17 00:00:00 2001 From: Dotan Barak Date: Fri, 6 Jan 2006 13:23:58 -0800 Subject: IB/mthca: Add support for automatic path migration (APM) Add code to modify QP operation to handle setting alternate paths for connected QPs. Signed-off-by: Dotan Barak Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 57 ++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index ff2def3e9d..564b6d51c3 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -549,6 +549,25 @@ static __be32 get_hw_access_flags(struct mthca_qp *qp, struct ib_qp_attr *attr, return cpu_to_be32(hw_access_flags); } +static void mthca_path_set(struct ib_ah_attr *ah, struct mthca_qp_path *path) +{ + path->g_mylmc = ah->src_path_bits & 0x7f; + path->rlid = cpu_to_be16(ah->dlid); + path->static_rate = !!ah->static_rate; + + if (ah->ah_flags & IB_AH_GRH) { + path->g_mylmc |= 1 << 7; + path->mgid_index = ah->grh.sgid_index; + path->hop_limit = ah->grh.hop_limit; + path->sl_tclass_flowlabel = + cpu_to_be32((ah->sl << 28) | + (ah->grh.traffic_class << 20) | + (ah->grh.flow_label)); + memcpy(path->rgid, ah->grh.dgid.raw, 16); + } else + path->sl_tclass_flowlabel = cpu_to_be32(ah->sl << 28); +} + int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) { struct mthca_dev *dev = to_mdev(ibqp->device); @@ -712,28 +731,14 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) } if (attr_mask & IB_QP_RNR_RETRY) { - qp_context->pri_path.rnr_retry = attr->rnr_retry << 5; - qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_RETRY); + qp_context->alt_path.rnr_retry = qp_context->pri_path.rnr_retry = + attr->rnr_retry << 5; + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_RETRY | + MTHCA_QP_OPTPAR_ALT_RNR_RETRY); } if (attr_mask & IB_QP_AV) { - qp_context->pri_path.g_mylmc = attr->ah_attr.src_path_bits & 0x7f; - qp_context->pri_path.rlid = cpu_to_be16(attr->ah_attr.dlid); - qp_context->pri_path.static_rate = !!attr->ah_attr.static_rate; - if (attr->ah_attr.ah_flags & IB_AH_GRH) { - qp_context->pri_path.g_mylmc |= 1 << 7; - qp_context->pri_path.mgid_index = attr->ah_attr.grh.sgid_index; - qp_context->pri_path.hop_limit = attr->ah_attr.grh.hop_limit; - qp_context->pri_path.sl_tclass_flowlabel = - cpu_to_be32((attr->ah_attr.sl << 28) | - (attr->ah_attr.grh.traffic_class << 20) | - (attr->ah_attr.grh.flow_label)); - memcpy(qp_context->pri_path.rgid, - attr->ah_attr.grh.dgid.raw, 16); - } else { - qp_context->pri_path.sl_tclass_flowlabel = - cpu_to_be32(attr->ah_attr.sl << 28); - } + mthca_path_set(&attr->ah_attr, &qp_context->pri_path); qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH); } @@ -742,7 +747,19 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ACK_TIMEOUT); } - /* XXX alt_path */ + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_port_num == 0 || attr->alt_port_num > dev->limits.num_ports) { + mthca_dbg(dev, "Alternate port number (%u) is invalid\n", + attr->alt_port_num); + return -EINVAL; + } + + mthca_path_set(&attr->alt_ah_attr, &qp_context->alt_path); + qp_context->alt_path.port_pkey |= cpu_to_be32(attr->alt_pkey_index | + attr->alt_port_num << 24); + qp_context->alt_path.ackto = attr->alt_timeout << 3; + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ALT_ADDR_PATH); + } /* leave rdd as 0 */ qp_context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pd_num); -- cgit v1.2.2 From b4ca1a3f8ca24033d7b7ef595faef97d9f8b2326 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 6 Jan 2006 16:21:19 -0800 
Subject: IB/uverbs: Fix reference counting on error paths If an operation fails after incrementing an object's reference count, then it should decrement the reference count on the error path. Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index a57d021d43..6985a57fa6 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -489,6 +489,7 @@ err_idr: err_unreg: ib_dereg_mr(mr); + atomic_dec(&pd->usecnt); err_up: up(&ib_uverbs_idr_mutex); @@ -935,6 +936,11 @@ err_idr: err_destroy: ib_destroy_qp(qp); + atomic_dec(&pd->usecnt); + atomic_dec(&attr.send_cq->usecnt); + atomic_dec(&attr.recv_cq->usecnt); + if (attr.srq) + atomic_dec(&attr.srq->usecnt); err_up: up(&ib_uverbs_idr_mutex); @@ -1729,6 +1735,7 @@ err_idr: err_destroy: ib_destroy_srq(srq); + atomic_dec(&pd->usecnt); err_up: up(&ib_uverbs_idr_mutex); -- cgit v1.2.2 From ea5d4a6ad2bfd1006790666981645cab43d3afbd Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 6 Jan 2006 16:24:45 -0800 Subject: IB/uverbs: set ah_flags when creating address handle AH attribute's ah_flags need to be set according to the is_global flag passed in from userspace. Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 6985a57fa6..12d6cc0a7f 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1454,6 +1454,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, attr.sl = cmd.attr.sl; attr.src_path_bits = cmd.attr.src_path_bits; attr.static_rate = cmd.attr.static_rate; + attr.ah_flags = cmd.attr.is_global ? IB_AH_GRH : 0; attr.port_num = cmd.attr.port_num; attr.grh.flow_label = cmd.attr.grh.flow_label; attr.grh.sgid_index = cmd.attr.grh.sgid_index; -- cgit v1.2.2 From ac4e7b35579de55db50d602a472858867808a9c3 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 6 Jan 2006 16:43:14 -0800 Subject: IB/uverbs: Release event file reference on ib_uverbs_create_cq() error ib_uverbs_create_cq() should release the completion channel event file if an error occurs after it looks it up. Also, if userspace asks for a completion channel and we don't find it, an error should be returned instead of silently creating a CQ without a completion channel. 
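The same unwind rule runs through these uverbs error-path fixes: any reference counted or object looked up before the point of failure has to be released again on the error path, in reverse order of acquisition. The standalone toy below sketches that pattern with hypothetical names; it is not uverbs code.

#include <stdio.h>
#include <stdlib.h>

static int refs;

static void take_ref(void) { refs++; }	/* stands in for atomic_inc(&pd->usecnt) */
static void drop_ref(void) { refs--; }	/* stands in for atomic_dec(&pd->usecnt) */

static int create_object(int fail_late, int with_unwind)
{
	void *obj;

	take_ref();			/* reference taken early, before the object exists */
	obj = malloc(16);
	if (!obj)
		goto err_ref;

	if (fail_late)			/* e.g. an idr insert or copy_to_user step fails */
		goto err_obj;

	return 0;			/* on success the new object legitimately keeps the reference */

err_obj:
	free(obj);
err_ref:
	if (with_unwind)
		drop_ref();		/* the decrement these patches add on the error path */
	return -1;
}

int main(void)
{
	refs = 0;
	create_object(1, 0);
	printf("references leaked without unwind: %d\n", refs);	/* prints 1 */
	refs = 0;
	create_object(1, 1);
	printf("references leaked with unwind:    %d\n", refs);	/* prints 0 */
	return 0;
}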
Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 12d6cc0a7f..a02c5a05c9 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -594,13 +594,18 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, if (cmd.comp_vector >= file->device->num_comp_vectors) return -EINVAL; - if (cmd.comp_channel >= 0) - ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel); - uobj = kmalloc(sizeof *uobj, GFP_KERNEL); if (!uobj) return -ENOMEM; + if (cmd.comp_channel >= 0) { + ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel); + if (!ev_file) { + ret = -EINVAL; + goto err; + } + } + uobj->uobject.user_handle = cmd.user_handle; uobj->uobject.context = file->ucontext; uobj->uverbs_file = file; @@ -664,6 +669,8 @@ err_up: ib_destroy_cq(cq); err: + if (ev_file) + ib_uverbs_release_ucq(file, ev_file, uobj); kfree(uobj); return ret; } -- cgit v1.2.2 From 4f8448dfe8d3804fadad90c9b77494238b4a4eae Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 6 Jan 2006 16:43:47 -0800 Subject: IB: Set GIDs correctly in ib_create_ah_from_wc() ib_create_ah_from_wc() doesn't create the correct return address (AH) when there is a GRH present (source & dest GIDs need to be swapped). Signed-off-by: Ralph Campbell Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 4c15e11273..c857361be4 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -107,9 +107,9 @@ struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, if (wc->wc_flags & IB_WC_GRH) { ah_attr.ah_flags = IB_AH_GRH; - ah_attr.grh.dgid = grh->dgid; + ah_attr.grh.dgid = grh->sgid; - ret = ib_find_cached_gid(pd->device, &grh->sgid, &port_num, + ret = ib_find_cached_gid(pd->device, &grh->dgid, &port_num, &gid_index); if (ret) return ERR_PTR(ret); -- cgit v1.2.2 From aa0e4e4aea8d9e0a559a884336d728f0263063e0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 6 Jan 2006 22:55:39 -0800 Subject: [DCCP]: ipv6.c needs net/ip6_checksum.c Reported by Dave Jones. Signed-off-by: David S. Miller --- net/dccp/ipv6.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index c609dc78f4..683250a05f 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "dccp.h" -- cgit v1.2.2 From ee2e6841b934d76cb944a3390bbea84da777d4fa Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Fri, 6 Jan 2006 22:59:43 -0800 Subject: [XFRM]: Fix sparse warning. security/selinux/xfrm.c:155:10: warning: Using plain integer as NULL pointer Signed-off-by: Luiz Capitulino Signed-off-by: Andrew Morton Signed-off-by: David S. 
Miller --- security/selinux/xfrm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index 5b7776504e..b2af7ca496 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -146,7 +146,7 @@ static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_us return rc; out: - *ctxp = 0; + *ctxp = NULL; kfree(ctx); return rc; } -- cgit v1.2.2 From 16a6677fdf1d1194f688f8291b06fbaff248c353 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 6 Jan 2006 23:01:48 -0800 Subject: [XFRM]: Netfilter IPsec output hooks Call netfilter hooks before IPsec transforms. Packets visit the FORWARD/LOCAL_OUT and POST_ROUTING hook before the first encapsulation and the LOCAL_OUT and POST_ROUTING hook before each following tunnel mode transform. Patch from Herbert Xu : Move the loop from dst_output into xfrm4_output/xfrm6_output since they're the only ones who need to it. xfrm{4,6}_output_one() processes the first SA all subsequent transport mode SAs and is called in a loop that calls the netfilter hooks between each two calls. In order to avoid the tail call issue, I've added the inline function nf_hook which is nf_hook_slow plus the empty list check. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter.h | 61 +++++++++++++++++++++++--------------- include/net/dst.h | 11 +------ net/ipv4/xfrm4_output.c | 71 +++++++++++++++++++++++++++++++++----------- net/ipv6/xfrm6_output.c | 75 +++++++++++++++++++++++++++++++++++------------ 4 files changed, 148 insertions(+), 70 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index be365e70ee..79bb977afe 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -168,6 +168,37 @@ void nf_log_packet(int pf, const struct net_device *out, struct nf_loginfo *li, const char *fmt, ...); + +int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb, + struct net_device *indev, struct net_device *outdev, + int (*okfn)(struct sk_buff *), int thresh); + +/** + * nf_hook_thresh - call a netfilter hook + * + * Returns 1 if the hook has allowed the packet to pass. The function + * okfn must be invoked by the caller in this case. Any other return + * value indicates the packet has been consumed by the hook. + */ +static inline int nf_hook_thresh(int pf, unsigned int hook, + struct sk_buff **pskb, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), int thresh) +{ +#ifndef CONFIG_NETFILTER_DEBUG + if (list_empty(&nf_hooks[pf][hook])) + return 1; +#endif + return nf_hook_slow(pf, hook, pskb, indev, outdev, okfn, thresh); +} + +static inline int nf_hook(int pf, unsigned int hook, struct sk_buff **pskb, + struct net_device *indev, struct net_device *outdev, + int (*okfn)(struct sk_buff *)) +{ + return nf_hook_thresh(pf, hook, pskb, indev, outdev, okfn, INT_MIN); +} /* Activate hook; either okfn or kfree_skb called, unless a hook returns NF_STOLEN (in which case, it's up to the hook to deal with @@ -188,35 +219,17 @@ void nf_log_packet(int pf, /* This is gross, but inline doesn't cut it for avoiding the function call in fast path: gcc doesn't inline (needs value tracking?). 
--RR */ -#ifdef CONFIG_NETFILTER_DEBUG -#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) \ -({int __ret; \ -if ((__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, INT_MIN)) == 1) \ - __ret = (okfn)(skb); \ -__ret;}) -#define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh) \ -({int __ret; \ -if ((__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, thresh)) == 1) \ - __ret = (okfn)(skb); \ -__ret;}) -#else -#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) \ -({int __ret; \ -if (list_empty(&nf_hooks[pf][hook]) || \ - (__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, INT_MIN)) == 1) \ - __ret = (okfn)(skb); \ -__ret;}) + +/* HX: It's slightly less gross now. */ + #define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh) \ ({int __ret; \ -if (list_empty(&nf_hooks[pf][hook]) || \ - (__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, thresh)) == 1) \ +if ((__ret=nf_hook_thresh(pf, hook, &(skb), indev, outdev, okfn, thresh)) == 1)\ __ret = (okfn)(skb); \ __ret;}) -#endif -int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb, - struct net_device *indev, struct net_device *outdev, - int (*okfn)(struct sk_buff *), int thresh); +#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) \ + NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, INT_MIN) /* Call setsockopt() */ int nf_setsockopt(struct sock *sk, int pf, int optval, char __user *opt, diff --git a/include/net/dst.h b/include/net/dst.h index bee8b84d32..5161e89017 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -225,16 +225,7 @@ static inline void dst_set_expires(struct dst_entry *dst, int timeout) /* Output packet to network from transport. */ static inline int dst_output(struct sk_buff *skb) { - int err; - - for (;;) { - err = skb->dst->output(skb); - - if (likely(err == 0)) - return err; - if (unlikely(err != NET_XMIT_BYPASS)) - return err; - } + return skb->dst->output(skb); } /* Input packet from network to transport. */ diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 66620a9594..51fabb8f7c 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -8,8 +8,10 @@ * 2 of the License, or (at your option) any later version. 
*/ +#include #include #include +#include #include #include #include @@ -95,7 +97,7 @@ out: return ret; } -int xfrm4_output(struct sk_buff *skb) +static int xfrm4_output_one(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; struct xfrm_state *x = dst->xfrm; @@ -113,27 +115,32 @@ int xfrm4_output(struct sk_buff *skb) goto error_nolock; } - spin_lock_bh(&x->lock); - err = xfrm_state_check(x, skb); - if (err) - goto error; + do { + spin_lock_bh(&x->lock); + err = xfrm_state_check(x, skb); + if (err) + goto error; - xfrm4_encap(skb); + xfrm4_encap(skb); - err = x->type->output(x, skb); - if (err) - goto error; + err = x->type->output(x, skb); + if (err) + goto error; - x->curlft.bytes += skb->len; - x->curlft.packets++; + x->curlft.bytes += skb->len; + x->curlft.packets++; - spin_unlock_bh(&x->lock); + spin_unlock_bh(&x->lock); - if (!(skb->dst = dst_pop(dst))) { - err = -EHOSTUNREACH; - goto error_nolock; - } - err = NET_XMIT_BYPASS; + if (!(skb->dst = dst_pop(dst))) { + err = -EHOSTUNREACH; + goto error_nolock; + } + dst = skb->dst; + x = dst->xfrm; + } while (x && !x->props.mode); + + err = 0; out_exit: return err; @@ -143,3 +150,33 @@ error_nolock: kfree_skb(skb); goto out_exit; } + +static int xfrm4_output_finish(struct sk_buff *skb) +{ + int err; + + while (likely((err = xfrm4_output_one(skb)) == 0)) { + nf_reset(skb); + + err = nf_hook(PF_INET, NF_IP_LOCAL_OUT, &skb, NULL, + skb->dst->dev, dst_output); + if (unlikely(err != 1)) + break; + + if (!skb->dst->xfrm) + return dst_output(skb); + + err = nf_hook(PF_INET, NF_IP_POST_ROUTING, &skb, NULL, + skb->dst->dev, xfrm4_output_finish); + if (unlikely(err != 1)) + break; + } + + return err; +} + +int xfrm4_output(struct sk_buff *skb) +{ + return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev, + xfrm4_output_finish); +} diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 6b9867717d..fc0ea38953 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -9,9 +9,11 @@ * 2 of the License, or (at your option) any later version. 
*/ +#include #include #include #include +#include #include #include #include @@ -92,7 +94,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) return ret; } -int xfrm6_output(struct sk_buff *skb) +static int xfrm6_output_one(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; struct xfrm_state *x = dst->xfrm; @@ -110,29 +112,34 @@ int xfrm6_output(struct sk_buff *skb) goto error_nolock; } - spin_lock_bh(&x->lock); - err = xfrm_state_check(x, skb); - if (err) - goto error; + do { + spin_lock_bh(&x->lock); + err = xfrm_state_check(x, skb); + if (err) + goto error; - xfrm6_encap(skb); + xfrm6_encap(skb); - err = x->type->output(x, skb); - if (err) - goto error; + err = x->type->output(x, skb); + if (err) + goto error; - x->curlft.bytes += skb->len; - x->curlft.packets++; + x->curlft.bytes += skb->len; + x->curlft.packets++; - spin_unlock_bh(&x->lock); + spin_unlock_bh(&x->lock); - skb->nh.raw = skb->data; - - if (!(skb->dst = dst_pop(dst))) { - err = -EHOSTUNREACH; - goto error_nolock; - } - err = NET_XMIT_BYPASS; + skb->nh.raw = skb->data; + + if (!(skb->dst = dst_pop(dst))) { + err = -EHOSTUNREACH; + goto error_nolock; + } + dst = skb->dst; + x = dst->xfrm; + } while (x && !x->props.mode); + + err = 0; out_exit: return err; @@ -142,3 +149,33 @@ error_nolock: kfree_skb(skb); goto out_exit; } + +static int xfrm6_output_finish(struct sk_buff *skb) +{ + int err; + + while (likely((err = xfrm6_output_one(skb)) == 0)) { + nf_reset(skb); + + err = nf_hook(PF_INET6, NF_IP6_LOCAL_OUT, &skb, NULL, + skb->dst->dev, dst_output); + if (unlikely(err != 1)) + break; + + if (!skb->dst->xfrm) + return dst_output(skb); + + err = nf_hook(PF_INET6, NF_IP6_POST_ROUTING, &skb, NULL, + skb->dst->dev, xfrm6_output_finish); + if (unlikely(err != 1)) + break; + } + + return err; +} + +int xfrm6_output(struct sk_buff *skb) +{ + return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dst->dev, + xfrm6_output_finish); +} -- cgit v1.2.2 From 951dbc8ac714b04c36296b8b5c36c8e036ce433f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 6 Jan 2006 23:02:34 -0800 Subject: [IPV6]: Move nextheader offset to the IP6CB Move nextheader offset to the IP6CB to make it possible to pass a packet to ip6_input_finish multiple times and have it skip already parsed headers. As a nice side effect this gets rid of the manual hopopts skipping in ip6_input_finish. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- include/linux/ipv6.h | 1 + include/net/protocol.h | 2 +- include/net/xfrm.h | 6 +++--- net/dccp/ipv6.c | 2 +- net/ipv6/exthdrs.c | 19 ++++++++++++------- net/ipv6/icmp.c | 4 ++-- net/ipv6/ip6_input.c | 21 ++++++--------------- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/reassembly.c | 11 +++++------ net/ipv6/tcp_ipv6.c | 2 +- net/ipv6/udp.c | 2 +- net/ipv6/xfrm6_input.c | 8 ++++---- net/ipv6/xfrm6_tunnel.c | 6 +++--- net/sctp/ipv6.c | 2 +- 14 files changed, 42 insertions(+), 46 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 93bbed5c6c..5cfc715295 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -191,6 +191,7 @@ struct inet6_skb_parm { __u16 srcrt; __u16 dst1; __u16 lastopt; + __u32 nhoff; }; #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) diff --git a/include/net/protocol.h b/include/net/protocol.h index 63f7db99c2..6dc5970612 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -43,7 +43,7 @@ struct net_protocol { #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) struct inet6_protocol { - int (*handler)(struct sk_buff **skb, unsigned int *nhoffp); + int (*handler)(struct sk_buff **skb); void (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 07d7b50cdd..297d09d28f 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -831,7 +831,7 @@ struct xfrm_tunnel { }; struct xfrm6_tunnel { - int (*handler)(struct sk_buff **pskb, unsigned int *nhoffp); + int (*handler)(struct sk_buff **pskb); void (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, int type, int code, int offset, __u32 info); }; @@ -868,8 +868,8 @@ extern int xfrm4_rcv(struct sk_buff *skb); extern int xfrm4_output(struct sk_buff *skb); extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler); extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler); -extern int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi); -extern int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp); +extern int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi); +extern int xfrm6_rcv(struct sk_buff **pskb); extern int xfrm6_tunnel_register(struct xfrm6_tunnel *handler); extern int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler); extern u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 683250a05f..df074259f9 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1029,7 +1029,7 @@ discard: return 0; } -static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +static int dccp_v6_rcv(struct sk_buff **pskb) { const struct dccp_hdr *dh; struct sk_buff *skb = *pskb; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 113374dc34..2a1e7e45b8 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -152,7 +152,7 @@ static struct tlvtype_proc tlvprocdestopt_lst[] = { {-1, NULL} }; -static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +static int ipv6_destopt_rcv(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; struct inet6_skb_parm *opt = IP6CB(skb); @@ -169,7 +169,7 @@ static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp) if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) { skb->h.raw += ((skb->h.raw[1]+1)<<3); - *nhoffp = opt->dst1; + opt->nhoff = opt->dst1; return 1; } @@ -192,7 +192,7 @@ void __init ipv6_destopt_init(void) NONE header. No data in packet. 
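To make the new convention concrete (this sketch is not part of the patch; the handler name and debug message are invented): an inet6_protocol handler now takes only the skb and reads the next-header offset that ipv6_rcv() and the extension-header parsers leave in the IP6CB, instead of receiving a separate nhoff argument.

#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/ipv6.h>

static int example6_rcv(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        __u32 nhoff = IP6CB(skb)->nhoff;        /* filled in before the handler runs */
        u8 nexthdr = skb->nh.raw[nhoff];        /* protocol value this handler matched */

        printk(KERN_DEBUG "example6_rcv: nexthdr %u at offset %u\n",
               nexthdr, nhoff);
        kfree_skb(skb);
        return 0;       /* 0: consumed; > 0 asks ip6_input_finish to resubmit */
}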
********************************/ -static int ipv6_nodata_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +static int ipv6_nodata_rcv(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; @@ -215,7 +215,7 @@ void __init ipv6_nodata_init(void) Routing header. ********************************/ -static int ipv6_rthdr_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +static int ipv6_rthdr_rcv(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; struct inet6_skb_parm *opt = IP6CB(skb); @@ -249,7 +249,7 @@ looped_back: skb->h.raw += (hdr->hdrlen + 1) << 3; opt->dst0 = opt->dst1; opt->dst1 = 0; - *nhoffp = (&hdr->nexthdr) - skb->nh.raw; + opt->nhoff = (&hdr->nexthdr) - skb->nh.raw; return 1; } @@ -487,9 +487,14 @@ static struct tlvtype_proc tlvprochopopt_lst[] = { int ipv6_parse_hopopts(struct sk_buff *skb, int nhoff) { - IP6CB(skb)->hop = sizeof(struct ipv6hdr); - if (ip6_parse_tlv(tlvprochopopt_lst, skb)) + struct inet6_skb_parm *opt = IP6CB(skb); + + opt->hop = sizeof(struct ipv6hdr); + if (ip6_parse_tlv(tlvprochopopt_lst, skb)) { + skb->h.raw += (skb->h.raw[1]+1)<<3; + opt->nhoff = sizeof(struct ipv6hdr); return sizeof(struct ipv6hdr); + } return -1; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 6ec6a2b549..53c81fcd20 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -79,7 +79,7 @@ DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly; static DEFINE_PER_CPU(struct socket *, __icmpv6_socket) = NULL; #define icmpv6_socket __get_cpu_var(__icmpv6_socket) -static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp); +static int icmpv6_rcv(struct sk_buff **pskb); static struct inet6_protocol icmpv6_protocol = { .handler = icmpv6_rcv, @@ -581,7 +581,7 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info) * Handle icmp messages */ -static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +static int icmpv6_rcv(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct net_device *dev = skb->dev; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index a6026d2787..13d724150f 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -97,6 +97,9 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->version != 6) goto err; + skb->h.raw = (u8 *)(hdr + 1); + IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + pkt_len = ntohs(hdr->payload_len); /* pkt_len may be zero if Jumbo payload option is present */ @@ -111,8 +114,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt } if (hdr->nexthdr == NEXTHDR_HOP) { - skb->h.raw = (u8*)(hdr+1); - if (ipv6_parse_hopopts(skb, offsetof(struct ipv6hdr, nexthdr)) < 0) { + if (ipv6_parse_hopopts(skb, IP6CB(skb)->nhoff) < 0) { IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); return 0; } @@ -143,26 +145,15 @@ static inline int ip6_input_finish(struct sk_buff *skb) int nexthdr; u8 hash; - skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr); - /* * Parse extension headers */ - nexthdr = skb->nh.ipv6h->nexthdr; - nhoff = offsetof(struct ipv6hdr, nexthdr); - - /* Skip hop-by-hop options, they are already parsed. 
*/ - if (nexthdr == NEXTHDR_HOP) { - nhoff = sizeof(struct ipv6hdr); - nexthdr = skb->h.raw[0]; - skb->h.raw += (skb->h.raw[1]+1)<<3; - } - rcu_read_lock(); resubmit: if (!pskb_pull(skb, skb->h.raw - skb->data)) goto discard; + nhoff = IP6CB(skb)->nhoff; nexthdr = skb->nh.raw[nhoff]; raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]); @@ -194,7 +185,7 @@ resubmit: !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard; - ret = ipprot->handler(&skb, &nhoff); + ret = ipprot->handler(&skb); if (ret > 0) goto resubmit; else if (ret == 0) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index e315d0f80a..f079621c8b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -510,7 +510,7 @@ static inline void ip6ip6_ecn_decapsulate(struct ipv6hdr *outer_iph, **/ static int -ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +ip6ip6_rcv(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct ipv6hdr *ipv6h; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5d316cb72e..15e1456b3f 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -581,7 +581,6 @@ err: * the last and the first frames arrived and all the bits are here. */ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, - unsigned int *nhoffp, struct net_device *dev) { struct sk_buff *fp, *head = fq->fragments; @@ -654,6 +653,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, head->dev = dev; skb_set_timestamp(head, &fq->stamp); head->nh.ipv6h->payload_len = htons(payload_len); + IP6CB(head)->nhoff = nhoff; *skb_in = head; @@ -663,7 +663,6 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS); fq->fragments = NULL; - *nhoffp = nhoff; return 1; out_oversize: @@ -678,7 +677,7 @@ out_fail: return -1; } -static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +static int ipv6_frag_rcv(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; struct net_device *dev = skb->dev; @@ -710,7 +709,7 @@ static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp) skb->h.raw += sizeof(struct frag_hdr); IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS); - *nhoffp = (u8*)fhdr - skb->nh.raw; + IP6CB(skb)->nhoff = (u8*)fhdr - skb->nh.raw; return 1; } @@ -722,11 +721,11 @@ static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp) spin_lock(&fq->lock); - ip6_frag_queue(fq, skb, fhdr, *nhoffp); + ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); if (fq->last_in == (FIRST_IN|LAST_IN) && fq->meat == fq->len) - ret = ip6_frag_reasm(fq, skbp, nhoffp, dev); + ret = ip6_frag_reasm(fq, skbp, dev); spin_unlock(&fq->lock); fq_put(fq, NULL); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2947bc56d8..a25f4e8a8a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1153,7 +1153,7 @@ ipv6_pktoptions: return 0; } -static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +static int tcp_v6_rcv(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct tcphdr *th; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d8538dcea8..c47648892c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -435,7 +435,7 @@ out: read_unlock(&udp_hash_lock); } -static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +static int udpv6_rcv(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct sock *sk; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 28c29d7833..1079e47f39 100644 --- a/net/ipv6/xfrm6_input.c +++ 
b/net/ipv6/xfrm6_input.c @@ -26,7 +26,7 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) IP6_ECN_set_ce(inner_iph); } -int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi) +int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi) { struct sk_buff *skb = *pskb; int err; @@ -38,7 +38,7 @@ int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi) int nexthdr; unsigned int nhoff; - nhoff = *nhoffp; + nhoff = IP6CB(skb)->nhoff; nexthdr = skb->nh.raw[nhoff]; seq = 0; @@ -144,7 +144,7 @@ drop: EXPORT_SYMBOL(xfrm6_rcv_spi); -int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +int xfrm6_rcv(struct sk_buff **pskb) { - return xfrm6_rcv_spi(pskb, nhoffp, 0); + return xfrm6_rcv_spi(pskb, 0); } diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index fbef7826a7..da09ff2586 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -397,7 +397,7 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler) EXPORT_SYMBOL(xfrm6_tunnel_deregister); -static int xfrm6_tunnel_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +static int xfrm6_tunnel_rcv(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct xfrm6_tunnel *handler = xfrm6_tunnel_handler; @@ -405,11 +405,11 @@ static int xfrm6_tunnel_rcv(struct sk_buff **pskb, unsigned int *nhoffp) u32 spi; /* device-like_ip6ip6_handler() */ - if (handler && handler->handler(pskb, nhoffp) == 0) + if (handler && handler->handler(pskb) == 0) return 0; spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(pskb, nhoffp, spi); + return xfrm6_rcv_spi(pskb, spi); } static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 15c05165c9..04c7fab4ed 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -905,7 +905,7 @@ static struct inet_protosw sctpv6_stream_protosw = { .flags = SCTP_PROTOSW_FLAG, }; -static int sctp6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +static int sctp6_rcv(struct sk_buff **pskb) { return sctp_rcv(*pskb) ? -1 : 0; } -- cgit v1.2.2 From b05e106698d9966de524e78d9da1bf6407fe0c32 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 6 Jan 2006 23:03:34 -0800 Subject: [IPV4/6]: Netfilter IPsec input hooks When the innermost transform uses transport mode the decapsulated packet is not visible to netfilter. Pass the packet through the PRE_ROUTING and LOCAL_IN hooks again before handing it to upper layer protocols to make netfilter-visibility symmetrical to the output path. Signed-off-by: Patrick McHardy Signed-off-by: David S.
Miller --- include/net/ipv6.h | 2 ++ net/ipv4/xfrm4_input.c | 31 +++++++++++++++++++++++++++++++ net/ipv6/ip6_input.c | 2 +- net/ipv6/xfrm6_input.c | 13 +++++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 860bbac4c4..3b1d963d39 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -418,6 +418,8 @@ extern int ipv6_rcv(struct sk_buff *skb, struct packet_type *pt, struct net_device *orig_dev); +extern int ip6_rcv_finish(struct sk_buff *skb); + /* * upper-layer output functions */ diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 2d3849c38a..850d919591 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -11,6 +11,8 @@ #include #include +#include +#include #include #include #include @@ -45,6 +47,23 @@ static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) return xfrm_parse_spi(skb, nexthdr, spi, seq); } +#ifdef CONFIG_NETFILTER +static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + + if (skb->dst == NULL) { + if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, + skb->dev)) + goto drop; + } + return dst_input(skb); +drop: + kfree_skb(skb); + return NET_RX_DROP; +} +#endif + int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) { int err; @@ -137,6 +156,8 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state)); skb->sp->len += xfrm_nr; + nf_reset(skb); + if (decaps) { if (!(skb->dev->flags&IFF_LOOPBACK)) { dst_release(skb->dst); @@ -145,7 +166,17 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) netif_rx(skb); return 0; } else { +#ifdef CONFIG_NETFILTER + __skb_push(skb, skb->data - skb->nh.raw); + skb->nh.iph->tot_len = htons(skb->len); + ip_send_check(skb->nh.iph); + + NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL, + xfrm4_rcv_encap_finish); + return 0; +#else return -skb->nh.iph->protocol; +#endif } drop_unlock: diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 13d724150f..29f73592e6 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -48,7 +48,7 @@ -static inline int ip6_rcv_finish( struct sk_buff *skb) +inline int ip6_rcv_finish( struct sk_buff *skb) { if (skb->dst == NULL) ip6_route_input(skb); diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 1079e47f39..1ca2da68ef 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -11,6 +11,8 @@ #include #include +#include +#include #include #include #include @@ -121,6 +123,8 @@ int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi) skb->sp->len += xfrm_nr; skb->ip_summed = CHECKSUM_NONE; + nf_reset(skb); + if (decaps) { if (!(skb->dev->flags&IFF_LOOPBACK)) { dst_release(skb->dst); @@ -129,7 +133,16 @@ int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi) netif_rx(skb); return -1; } else { +#ifdef CONFIG_NETFILTER + skb->nh.ipv6h->payload_len = htons(skb->len); + __skb_push(skb, skb->data - skb->nh.raw); + + NF_HOOK(PF_INET6, NF_IP6_PRE_ROUTING, skb, skb->dev, NULL, + ip6_rcv_finish); + return -1; +#else return 1; +#endif } drop_unlock: -- cgit v1.2.2 From 8cdfab8a43bb4b3da686ea503a702cb6f9f6a803 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 6 Jan 2006 23:04:01 -0800 Subject: [IPV4]: reset IPCB flags when necessary Reset IPSKB_XFRM_TUNNEL_SIZE flags in ipip and ip_gre hard_start_xmit function before the packet reenters IP.
This is necessary so the encapsulated packets are checked not to be oversized in xfrm4_output.c again. Reset all flags in sit when a packet changes its address family. Also remove some obsolete IPSKB flags. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/net/ip.h | 8 +++----- net/ipv4/ip_gre.c | 1 + net/ipv4/ipip.c | 1 + net/ipv6/sit.c | 2 ++ 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 7bb5804847..52f4d9c697 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -37,11 +37,9 @@ struct inet_skb_parm struct ip_options opt; /* Compiled IP options */ unsigned char flags; -#define IPSKB_MASQUERADED 1 -#define IPSKB_TRANSLATED 2 -#define IPSKB_FORWARDED 4 -#define IPSKB_XFRM_TUNNEL_SIZE 8 -#define IPSKB_FRAG_COMPLETE 16 +#define IPSKB_FORWARDED 1 +#define IPSKB_XFRM_TUNNEL_SIZE 2 +#define IPSKB_FRAG_COMPLETE 4 }; struct ipcm_cookie diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 912c42f57c..65c3a91ed8 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -832,6 +832,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->h.raw = skb->nh.raw; skb->nh.raw = skb_push(skb, gre_hlen); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~IPSKB_XFRM_TUNNEL_SIZE; dst_release(skb->dst); skb->dst = &rt->u.dst; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 35571cff81..078b59be91 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -621,6 +621,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->h.raw = skb->nh.raw; skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~IPSKB_XFRM_TUNNEL_SIZE; dst_release(skb->dst); skb->dst = &rt->u.dst; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 577d49732b..02872ae8a4 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -381,6 +381,7 @@ static int ipip6_rcv(struct sk_buff *skb) skb->mac.raw = skb->nh.raw; skb->nh.raw = skb->data; memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + IPCB(skb)->flags = 0; skb->protocol = htons(ETH_P_IPV6); skb->pkt_type = PACKET_HOST; tunnel->stat.rx_packets++; @@ -552,6 +553,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->h.raw = skb->nh.raw; skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags = 0; dst_release(skb->dst); skb->dst = &rt->u.dst; -- cgit v1.2.2 From 3e3850e989c5d2eb1aab6f0fd9257759f0f4cbc6 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 6 Jan 2006 23:04:54 -0800 Subject: [NETFILTER]: Fix xfrm lookup in ip_route_me_harder/ip6_route_me_harder ip_route_me_harder doesn't use the port numbers of the xfrm lookup and uses ip_route_input for non-local addresses which doesn't do a xfrm lookup, ip6_route_me_harder doesn't do a xfrm lookup at all. Use xfrm_decode_session and do the lookup manually, make sure both only do the lookup if the packet hasn't been transformed already. Making sure the lookup only happens once needs a new field in the IP6CB, which exceeds the size of skb->cb. The size of skb->cb is increased to 48b. Apparently the IPv6 mobile extensions need some more room anyway. Signed-off-by: Patrick McHardy Signed-off-by: David S.
Miller --- include/linux/ipv6.h | 3 +++ include/linux/skbuff.h | 2 +- include/net/ip.h | 3 ++- include/net/xfrm.h | 2 +- net/ipv4/ip_gre.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv4/netfilter.c | 12 ++++++++++-- net/ipv4/xfrm4_output.c | 1 + net/ipv6/netfilter.c | 9 ++++++++- net/ipv6/xfrm6_output.c | 1 + net/xfrm/xfrm_policy.c | 9 +++++---- 11 files changed, 34 insertions(+), 12 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 5cfc715295..9c8f4c9ed4 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -192,6 +192,9 @@ struct inet6_skb_parm { __u16 dst1; __u16 lastopt; __u32 nhoff; + __u16 flags; + +#define IP6SKB_XFRM_TRANSFORMED 1 }; #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 483cfc47ec..e5fd66c565 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -251,7 +251,7 @@ struct sk_buff { * want to keep them across layers you have to do a skb_clone() * first. This is owned by whoever has the skb queued ATM. */ - char cb[40]; + char cb[48]; unsigned int len, data_len, diff --git a/include/net/ip.h b/include/net/ip.h index 52f4d9c697..a494d04e5d 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -39,7 +39,8 @@ struct inet_skb_parm #define IPSKB_FORWARDED 1 #define IPSKB_XFRM_TUNNEL_SIZE 2 -#define IPSKB_FRAG_COMPLETE 4 +#define IPSKB_XFRM_TRANSFORMED 4 +#define IPSKB_FRAG_COMPLETE 8 }; struct ipcm_cookie diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 297d09d28f..d6111a2f0a 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -668,7 +668,7 @@ static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *s return xfrm_policy_check(sk, dir, skb, AF_INET6); } - +extern int xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family); extern int __xfrm_route_forward(struct sk_buff *skb, unsigned short family); static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 65c3a91ed8..de16e94477 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -832,7 +832,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->h.raw = skb->nh.raw; skb->nh.raw = skb_push(skb, gre_hlen); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~IPSKB_XFRM_TUNNEL_SIZE; + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE|IPSKB_XFRM_TRANSFORMED); dst_release(skb->dst); skb->dst = &rt->u.dst; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 078b59be91..bbd85f5ec9 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -621,7 +621,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->h.raw = skb->nh.raw; skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~IPSKB_XFRM_TUNNEL_SIZE; + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE|IPSKB_XFRM_TRANSFORMED); dst_release(skb->dst); skb->dst = &rt->u.dst; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index ae0779d82c..4c637a1cbd 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -7,11 +7,13 @@ #include #include +#include #include #include #include #include -#include +#include +#include /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ int ip_route_me_harder(struct sk_buff **pskb) @@ -33,7 +35,6 @@ int ip_route_me_harder(struct sk_buff **pskb) #ifdef CONFIG_IP_ROUTE_FWMARK fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; 
#endif - fl.proto = iph->protocol; if (ip_route_output_key(&rt, &fl) != 0) return -1; @@ -60,6 +61,13 @@ int ip_route_me_harder(struct sk_buff **pskb) if ((*pskb)->dst->error) return -1; +#ifdef CONFIG_XFRM + if (!(IPCB(*pskb)->flags & IPSKB_XFRM_TRANSFORMED) && + xfrm_decode_session(*pskb, &fl, AF_INET) == 0) + if (xfrm_lookup(&(*pskb)->dst, &fl, (*pskb)->sk, 0)) + return -1; +#endif + /* Change in oif may mean change in hh_len. */ hh_len = (*pskb)->dst->dev->hard_header_len; if (skb_headroom(*pskb) < hh_len) { diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 51fabb8f7c..160c48800a 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -140,6 +140,7 @@ static int xfrm4_output_one(struct sk_buff *skb) x = dst->xfrm; } while (x && !x->props.mode); + IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; err = 0; out_exit: diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index f8626ebf90..b63678328a 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -10,6 +10,7 @@ #include #include #include +#include int ip6_route_me_harder(struct sk_buff *skb) { @@ -21,11 +22,17 @@ int ip6_route_me_harder(struct sk_buff *skb) { .ip6_u = { .daddr = iph->daddr, .saddr = iph->saddr, } }, - .proto = iph->nexthdr, }; dst = ip6_route_output(skb->sk, &fl); +#ifdef CONFIG_XFRM + if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + xfrm_decode_session(skb, &fl, AF_INET6) == 0) + if (xfrm_lookup(&skb->dst, &fl, skb->sk, 0)) + return -1; +#endif + if (dst->error) { IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n"); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index fc0ea38953..80242172a5 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -139,6 +139,7 @@ static int xfrm6_output_one(struct sk_buff *skb) x = dst->xfrm; } while (x && !x->props.mode); + IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; err = 0; out_exit: diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 64a447375f..f2edc9225b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -951,8 +951,8 @@ xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start, return start; } -static int -_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family) +int +xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family) { struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); @@ -963,6 +963,7 @@ _decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family) xfrm_policy_put_afinfo(afinfo); return 0; } +EXPORT_SYMBOL(xfrm_decode_session); static inline int secpath_has_tunnel(struct sec_path *sp, int k) { @@ -982,7 +983,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, u8 fl_dir = policy_to_flow_dir(dir); u32 sk_sid; - if (_decode_session(skb, &fl, family) < 0) + if (xfrm_decode_session(skb, &fl, family) < 0) return 0; sk_sid = security_sk_sid(sk, &fl, fl_dir); @@ -1055,7 +1056,7 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct flowi fl; - if (_decode_session(skb, &fl, family) < 0) + if (xfrm_decode_session(skb, &fl, family) < 0) return 0; return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0; -- cgit v1.2.2 From 4e8e9de7c25315669e2d5565acc50ec379522c28 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 6 Jan 2006 23:05:17 -0800 Subject: [NETFILTER]: Use conntrack information to determine if packet was NATed Preparation for IPsec support for NAT: Use conntrack 
information instead of saving and comparing the addresses to determine if a packet was NATed and needs to be rerouted to make it easier to extend the key. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_nat_standalone.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index f04111f74e..1bb50897a5 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -162,18 +162,20 @@ ip_nat_in(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - u_int32_t saddr, daddr; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; unsigned int ret; - saddr = (*pskb)->nh.iph->saddr; - daddr = (*pskb)->nh.iph->daddr; - ret = ip_nat_fn(hooknum, pskb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN - && ((*pskb)->nh.iph->saddr != saddr - || (*pskb)->nh.iph->daddr != daddr)) { - dst_release((*pskb)->dst); - (*pskb)->dst = NULL; + && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.src.ip != + ct->tuplehash[!dir].tuple.dst.ip) { + dst_release((*pskb)->dst); + (*pskb)->dst = NULL; + } } return ret; } @@ -200,7 +202,8 @@ ip_nat_local_fn(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - u_int32_t saddr, daddr; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; unsigned int ret; /* root is playing with raw sockets. */ @@ -208,14 +211,15 @@ ip_nat_local_fn(unsigned int hooknum, || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) return NF_ACCEPT; - saddr = (*pskb)->nh.iph->saddr; - daddr = (*pskb)->nh.iph->daddr; - ret = ip_nat_fn(hooknum, pskb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN - && ((*pskb)->nh.iph->saddr != saddr - || (*pskb)->nh.iph->daddr != daddr)) - return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.dst.ip != + ct->tuplehash[!dir].tuple.src.ip) + return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + } return ret; } -- cgit v1.2.2 From 5c901daaea3be0d900b3ae1fc9b5f64ff94e4f02 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 6 Jan 2006 23:05:36 -0800 Subject: [NETFILTER]: Redo policy lookups after NAT when necessary When NAT changes the key used for the xfrm lookup it needs to be done again. If a new policy is returned in POST_ROUTING the packet needs to be passed to xfrm4_output_one manually after all hooks were called because POST_ROUTING is called with fixed okfn (ip_finish_output). Signed-off-by: Patrick McHardy Signed-off-by: David S.
Miller --- include/net/xfrm.h | 1 + net/ipv4/ip_output.c | 5 +++++ net/ipv4/netfilter/ip_nat_standalone.c | 27 +++++++++++++++++++++++++-- net/ipv4/xfrm4_output.c | 2 +- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index d6111a2f0a..d09ca0e7d1 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -866,6 +866,7 @@ extern int xfrm_state_mtu(struct xfrm_state *x, int mtu); extern int xfrm_init_state(struct xfrm_state *x); extern int xfrm4_rcv(struct sk_buff *skb); extern int xfrm4_output(struct sk_buff *skb); +extern int xfrm4_output_finish(struct sk_buff *skb); extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler); extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler); extern int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8b1c9bd009..59fdac3a09 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -202,6 +202,11 @@ static inline int ip_finish_output2(struct sk_buff *skb) static inline int ip_finish_output(struct sk_buff *skb) { +#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) + /* Policy lookup after SNAT yielded a new policy */ + if (skb->dst->xfrm != NULL) + return xfrm4_output_finish(skb); +#endif if (skb->len > dst_mtu(skb->dst) && !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) return ip_fragment(skb, ip_finish_output2); diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 1bb50897a5..b518697af4 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -187,12 +187,30 @@ ip_nat_out(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + unsigned int ret; + /* root is playing with raw sockets. */ if ((*pskb)->len < sizeof(struct iphdr) || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) return NF_ACCEPT; - return ip_nat_fn(hooknum, pskb, in, out, okfn); + ret = ip_nat_fn(hooknum, pskb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN + && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.src.ip != + ct->tuplehash[!dir].tuple.dst.ip +#ifdef CONFIG_XFRM + || ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all +#endif + ) + return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + } + return ret; } static unsigned int @@ -217,7 +235,12 @@ ip_nat_local_fn(unsigned int hooknum, enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.dst.ip != - ct->tuplehash[!dir].tuple.src.ip) + ct->tuplehash[!dir].tuple.src.ip +#ifdef CONFIG_XFRM + || ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[dir].tuple.src.u.all +#endif + ) return ip_route_me_harder(pskb) == 0 ? 
ret : NF_DROP; } return ret; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 160c48800a..d4df0ddd42 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -152,7 +152,7 @@ error_nolock: goto out_exit; } -static int xfrm4_output_finish(struct sk_buff *skb) +int xfrm4_output_finish(struct sk_buff *skb) { int err; -- cgit v1.2.2 From b59c270104f03960069596722fea70340579244d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 6 Jan 2006 23:06:10 -0800 Subject: [NETFILTER]: Keep conntrack reference until IPsec policy checks are done Keep the conntrack reference until policy checks have been performed for IPsec NAT support. The reference needs to be dropped before a packet is queued to avoid having the conntrack module unloadable. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 1 + net/ipv4/ip_input.c | 15 ++++++--------- net/ipv4/raw.c | 1 + net/ipv4/tcp_ipv4.c | 1 + net/ipv4/udp.c | 2 ++ net/sctp/input.c | 1 + 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 3f24467076..23ba177c11 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1099,6 +1099,7 @@ int dccp_v4_destroy_sock(struct sock *sk) kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; } + nf_reset(skb); /* Clean up a referenced DCCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash != NULL) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index e45846ae57..18d7fad474 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -185,7 +185,6 @@ int ip_call_ra_chain(struct sk_buff *skb) raw_rcv(last, skb2); } last = sk; - nf_reset(skb); } } @@ -204,10 +203,6 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb) __skb_pull(skb, ihl); - /* Free reference early: we don't need it any more, and it may - hold ip_conntrack module loaded indefinitely. */ - nf_reset(skb); - /* Point into the IP datagram, just past the header. */ skb->h.raw = skb->data; @@ -232,10 +227,12 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb) if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { int ret; - if (!ipprot->no_policy && - !xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - kfree_skb(skb); - goto out; + if (!ipprot->no_policy) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + goto out; + } + nf_reset(skb); } ret = ipprot->handler(skb); if (ret < 0) { diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4b0d7e4d62..165a4d81ef 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -255,6 +255,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); return NET_RX_DROP; } + nf_reset(skb); skb_push(skb, skb->data - skb->nh.raw); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e9f83e5b28..6ea353907a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1080,6 +1080,7 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; + nf_reset(skb); if (sk_filter(sk, skb, 0)) goto discard_and_relse; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 223abaa72b..00840474a4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -989,6 +989,7 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) kfree_skb(skb); return -1; } + nf_reset(skb); if (up->encap_type) { /* @@ -1149,6 +1150,7 @@ int udp_rcv(struct sk_buff *skb) if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; + nf_reset(skb); /* No socket. 
Drop packet silently, if checksum is wrong */ if (udp_checksum_complete(skb)) diff --git a/net/sctp/input.c b/net/sctp/input.c index 238f1bffa6..4aa6fc6035 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -225,6 +225,7 @@ int sctp_rcv(struct sk_buff *skb) if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) goto discard_release; + nf_reset(skb); ret = sk_filter(sk, skb, 1); if (ret) -- cgit v1.2.2 From eb9c7ebe6980c41cf6ae889e301c3b49f473ee9f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 6 Jan 2006 23:06:30 -0800 Subject: [NETFILTER]: Handle NAT in IPsec policy checks Handle NAT of decapsulated IPsec packets by reconstructing the struct flowi of the original packet from the conntrack information for IPsec policy checks. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter.h | 16 +++++++++++ net/dccp/ipv4.c | 2 +- net/ipv4/netfilter.c | 3 ++ net/ipv4/netfilter/ip_nat_standalone.c | 50 ++++++++++++++++++++++++++++++++-- net/xfrm/xfrm_policy.c | 2 ++ 5 files changed, 70 insertions(+), 3 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 79bb977afe..84506dfa1f 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -274,6 +274,20 @@ struct nf_queue_rerouter { extern int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer); extern int nf_unregister_queue_rerouter(int pf); +#include +extern void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *); + +static inline void +nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, int family) +{ +#ifdef CONFIG_IP_NF_NAT_NEEDED + void (*decodefn)(struct sk_buff *, struct flowi *); + + if (family == AF_INET && (decodefn = ip_nat_decode_session) != NULL) + decodefn(skb, fl); +#endif +} + #ifdef CONFIG_PROC_FS #include extern struct proc_dir_entry *proc_net_netfilter; @@ -282,6 +296,8 @@ extern struct proc_dir_entry *proc_net_netfilter; #else /* !CONFIG_NETFILTER */ #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb) static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} +static inline void +nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, int family) {} #endif /*CONFIG_NETFILTER*/ #endif /*__KERNEL__*/ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 23ba177c11..00f9832266 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -986,6 +986,7 @@ int dccp_v4_rcv(struct sk_buff *skb) if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; + nf_reset(skb); return sk_receive_skb(sk, skb); @@ -1099,7 +1100,6 @@ int dccp_v4_destroy_sock(struct sock *sk) kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; } - nf_reset(skb); /* Clean up a referenced DCCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash != NULL) diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 4c637a1cbd..3321092b09 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -86,6 +86,9 @@ int ip_route_me_harder(struct sk_buff **pskb) } EXPORT_SYMBOL(ip_route_me_harder); +void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *); +EXPORT_SYMBOL(ip_nat_decode_session); + /* * Extra routing may needed on local out, as the QUEUE target never * returns control to the table. diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index b518697af4..8b8a1f00bb 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -55,6 +55,44 @@ : ((hooknum) == NF_IP_LOCAL_IN ? 
"LOCAL_IN" \ : "*ERROR*"))) +#ifdef CONFIG_XFRM +static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) +{ + struct ip_conntrack *ct; + struct ip_conntrack_tuple *t; + enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; + unsigned long statusbit; + + ct = ip_conntrack_get(skb, &ctinfo); + if (ct == NULL) + return; + dir = CTINFO2DIR(ctinfo); + t = &ct->tuplehash[dir].tuple; + + if (dir == IP_CT_DIR_ORIGINAL) + statusbit = IPS_DST_NAT; + else + statusbit = IPS_SRC_NAT; + + if (ct->status & statusbit) { + fl->fl4_dst = t->dst.ip; + if (t->dst.protonum == IPPROTO_TCP || + t->dst.protonum == IPPROTO_UDP) + fl->fl_ip_dport = t->dst.u.tcp.port; + } + + statusbit ^= IPS_NAT_MASK; + + if (ct->status & statusbit) { + fl->fl4_src = t->src.ip; + if (t->dst.protonum == IPPROTO_TCP || + t->dst.protonum == IPPROTO_UDP) + fl->fl_ip_sport = t->src.u.tcp.port; + } +} +#endif + static unsigned int ip_nat_fn(unsigned int hooknum, struct sk_buff **pskb, @@ -330,10 +368,14 @@ static int init_or_cleanup(int init) if (!init) goto cleanup; +#ifdef CONFIG_XFRM + BUG_ON(ip_nat_decode_session != NULL); + ip_nat_decode_session = nat_decode_session; +#endif ret = ip_nat_rule_init(); if (ret < 0) { printk("ip_nat_init: can't setup rules.\n"); - goto cleanup_nothing; + goto cleanup_decode_session; } ret = nf_register_hook(&ip_nat_in_ops); if (ret < 0) { @@ -381,7 +423,11 @@ static int init_or_cleanup(int init) nf_unregister_hook(&ip_nat_in_ops); cleanup_rule_init: ip_nat_rule_cleanup(); - cleanup_nothing: + cleanup_decode_session: +#ifdef CONFIG_XFRM + ip_nat_decode_session = NULL; + synchronize_net(); +#endif return ret; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f2edc9225b..59614a994b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -985,6 +986,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, if (xfrm_decode_session(skb, &fl, family) < 0) return 0; + nf_nat_decode_session(skb, &fl, family); sk_sid = security_sk_sid(sk, &fl, fl_dir); -- cgit v1.2.2 From e16a8f0b8c53312beb1d8b52e463aae79aa809c7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 6 Jan 2006 23:06:48 -0800 Subject: [NETFILTER]: Add ipt_policy/ip6t_policy matches Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- include/linux/netfilter_ipv4/ipt_policy.h | 52 +++++++++ include/linux/netfilter_ipv6/ip6t_policy.h | 52 +++++++++ net/ipv4/netfilter/Kconfig | 10 ++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/ipt_policy.c | 170 ++++++++++++++++++++++++++++ net/ipv6/netfilter/Kconfig | 10 ++ net/ipv6/netfilter/Makefile | 1 + net/ipv6/netfilter/ip6t_policy.c | 175 +++++++++++++++++++++++++++++ 8 files changed, 471 insertions(+) create mode 100644 include/linux/netfilter_ipv4/ipt_policy.h create mode 100644 include/linux/netfilter_ipv6/ip6t_policy.h create mode 100644 net/ipv4/netfilter/ipt_policy.c create mode 100644 net/ipv6/netfilter/ip6t_policy.c diff --git a/include/linux/netfilter_ipv4/ipt_policy.h b/include/linux/netfilter_ipv4/ipt_policy.h new file mode 100644 index 0000000000..7fd1bec453 --- /dev/null +++ b/include/linux/netfilter_ipv4/ipt_policy.h @@ -0,0 +1,52 @@ +#ifndef _IPT_POLICY_H +#define _IPT_POLICY_H + +#define IPT_POLICY_MAX_ELEM 4 + +enum ipt_policy_flags +{ + IPT_POLICY_MATCH_IN = 0x1, + IPT_POLICY_MATCH_OUT = 0x2, + IPT_POLICY_MATCH_NONE = 0x4, + IPT_POLICY_MATCH_STRICT = 0x8, +}; + +enum ipt_policy_modes +{ + IPT_POLICY_MODE_TRANSPORT, + IPT_POLICY_MODE_TUNNEL +}; + +struct ipt_policy_spec +{ + u_int8_t saddr:1, + daddr:1, + proto:1, + mode:1, + spi:1, + reqid:1; +}; + +struct ipt_policy_elem +{ + u_int32_t saddr; + u_int32_t smask; + u_int32_t daddr; + u_int32_t dmask; + u_int32_t spi; + u_int32_t reqid; + u_int8_t proto; + u_int8_t mode; + + struct ipt_policy_spec match; + struct ipt_policy_spec invert; +}; + +struct ipt_policy_info +{ + struct ipt_policy_elem pol[IPT_POLICY_MAX_ELEM]; + u_int16_t flags; + u_int16_t len; +}; + +#endif /* _IPT_POLICY_H */ diff --git a/include/linux/netfilter_ipv6/ip6t_policy.h b/include/linux/netfilter_ipv6/ip6t_policy.h new file mode 100644 index 0000000000..5a93afcd2f --- /dev/null +++ b/include/linux/netfilter_ipv6/ip6t_policy.h @@ -0,0 +1,52 @@ +#ifndef _IP6T_POLICY_H +#define _IP6T_POLICY_H + +#define IP6T_POLICY_MAX_ELEM 4 + +enum ip6t_policy_flags +{ + IP6T_POLICY_MATCH_IN = 0x1, + IP6T_POLICY_MATCH_OUT = 0x2, + IP6T_POLICY_MATCH_NONE = 0x4, + IP6T_POLICY_MATCH_STRICT = 0x8, +}; + +enum ip6t_policy_modes +{ + IP6T_POLICY_MODE_TRANSPORT, + IP6T_POLICY_MODE_TUNNEL +}; + +struct ip6t_policy_spec +{ + u_int8_t saddr:1, + daddr:1, + proto:1, + mode:1, + spi:1, + reqid:1; +}; + +struct ip6t_policy_elem +{ + struct in6_addr saddr; + struct in6_addr smask; + struct in6_addr daddr; + struct in6_addr dmask; + u_int32_t spi; + u_int32_t reqid; + u_int8_t proto; + u_int8_t mode; + + struct ip6t_policy_spec match; + struct ip6t_policy_spec invert; +}; + +struct ip6t_policy_info +{ + struct ip6t_policy_elem pol[IP6T_POLICY_MAX_ELEM]; + u_int16_t flags; + u_int16_t len; +}; + +#endif /* _IP6T_POLICY_H */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 88a60650e6..a9893ec03e 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -487,6 +487,16 @@ config IP_NF_MATCH_STRING To compile it as a module, choose M here. If unsure, say N. +config IP_NF_MATCH_POLICY + tristate "IPsec policy match support" + depends on IP_NF_IPTABLES && XFRM + help + Policy matching allows you to match packets based on the + IPsec policy that was used during decapsulation/will + be used during encapsulation. + + To compile it as a module, choose M here. If unsure, say N. 
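To make the new match interface concrete, here is an illustrative userspace sketch of filling in the ipt_policy_info structure defined above so that it matches packets decapsulated from an inbound, tunnel-mode ESP SA with reqid 1. The function name and the chosen values are assumptions made for this example; only the structure, enum and flag names come from the ipt_policy.h header added by the patch.

#include <string.h>
#include <sys/types.h>
#include <netinet/in.h>                         /* IPPROTO_ESP */
#include <linux/netfilter_ipv4/ipt_policy.h>

static void fill_policy_match(struct ipt_policy_info *info)
{
        struct ipt_policy_elem *e = &info->pol[0];

        memset(info, 0, sizeof(*info));
        info->flags = IPT_POLICY_MATCH_IN;      /* check the inbound sec_path */
        info->len = 1;                          /* one policy element in use */

        e->proto = IPPROTO_ESP;
        e->mode = IPT_POLICY_MODE_TUNNEL;
        e->reqid = 1;
        e->match.proto = 1;                     /* compare only these three fields */
        e->match.mode = 1;
        e->match.reqid = 1;
}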
+ # `filter', generic and specific targets config IP_NF_FILTER tristate "Packet filtering" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index d0a447e520..549b01a648 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -72,6 +72,7 @@ obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o +obj-$(CONFIG_IP_NF_MATCH_POLICY) += ipt_policy.o obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o diff --git a/net/ipv4/netfilter/ipt_policy.c b/net/ipv4/netfilter/ipt_policy.c new file mode 100644 index 0000000000..709debcc69 --- /dev/null +++ b/net/ipv4/netfilter/ipt_policy.c @@ -0,0 +1,170 @@ +/* IP tables module for matching IPsec policy + * + * Copyright (c) 2004,2005 Patrick McHardy, + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("IPtables IPsec policy matching module"); +MODULE_LICENSE("GPL"); + + +static inline int +match_xfrm_state(struct xfrm_state *x, const struct ipt_policy_elem *e) +{ +#define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x)) + + return MATCH(saddr, x->props.saddr.a4 & e->smask) && + MATCH(daddr, x->id.daddr.a4 & e->dmask) && + MATCH(proto, x->id.proto) && + MATCH(mode, x->props.mode) && + MATCH(spi, x->id.spi) && + MATCH(reqid, x->props.reqid); +} + +static int +match_policy_in(const struct sk_buff *skb, const struct ipt_policy_info *info) +{ + const struct ipt_policy_elem *e; + struct sec_path *sp = skb->sp; + int strict = info->flags & IPT_POLICY_MATCH_STRICT; + int i, pos; + + if (sp == NULL) + return -1; + if (strict && info->len != sp->len) + return 0; + + for (i = sp->len - 1; i >= 0; i--) { + pos = strict ? i - sp->len + 1 : 0; + if (pos >= info->len) + return 0; + e = &info->pol[pos]; + + if (match_xfrm_state(sp->x[i].xvec, e)) { + if (!strict) + return 1; + } else if (strict) + return 0; + } + + return strict ? 1 : 0; +} + +static int +match_policy_out(const struct sk_buff *skb, const struct ipt_policy_info *info) +{ + const struct ipt_policy_elem *e; + struct dst_entry *dst = skb->dst; + int strict = info->flags & IPT_POLICY_MATCH_STRICT; + int i, pos; + + if (dst->xfrm == NULL) + return -1; + + for (i = 0; dst && dst->xfrm; dst = dst->child, i++) { + pos = strict ? i : 0; + if (pos >= info->len) + return 0; + e = &info->pol[pos]; + + if (match_xfrm_state(dst->xfrm, e)) { + if (!strict) + return 1; + } else if (strict) + return 0; + } + + return strict ? 1 : 0; +} + +static int match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, int offset, int *hotdrop) +{ + const struct ipt_policy_info *info = matchinfo; + int ret; + + if (info->flags & IPT_POLICY_MATCH_IN) + ret = match_policy_in(skb, info); + else + ret = match_policy_out(skb, info); + + if (ret < 0) + ret = info->flags & IPT_POLICY_MATCH_NONE ? 
1 : 0; + else if (info->flags & IPT_POLICY_MATCH_NONE) + ret = 0; + + return ret; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_policy_info *info = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(*info))) { + printk(KERN_ERR "ipt_policy: matchsize %u != %zu\n", + matchsize, IPT_ALIGN(sizeof(*info))); + return 0; + } + if (!(info->flags & (IPT_POLICY_MATCH_IN|IPT_POLICY_MATCH_OUT))) { + printk(KERN_ERR "ipt_policy: neither incoming nor " + "outgoing policy selected\n"); + return 0; + } + if (hook_mask & (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_LOCAL_IN) + && info->flags & IPT_POLICY_MATCH_OUT) { + printk(KERN_ERR "ipt_policy: output policy not valid in " + "PRE_ROUTING and INPUT\n"); + return 0; + } + if (hook_mask & (1 << NF_IP_POST_ROUTING | 1 << NF_IP_LOCAL_OUT) + && info->flags & IPT_POLICY_MATCH_IN) { + printk(KERN_ERR "ipt_policy: input policy not valid in " + "POST_ROUTING and OUTPUT\n"); + return 0; + } + if (info->len > IPT_POLICY_MAX_ELEM) { + printk(KERN_ERR "ipt_policy: too many policy elements\n"); + return 0; + } + + return 1; +} + +static struct ipt_match policy_match = { + .name = "policy", + .match = match, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&policy_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&policy_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 04912f9b35..105dd69ee9 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -179,6 +179,16 @@ config IP6_NF_MATCH_PHYSDEV To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_MATCH_POLICY + tristate "IPsec policy match support" + depends on IP6_NF_IPTABLES && XFRM + help + Policy matching allows you to match packets based on the + IPsec policy that was used during decapsulation/will + be used during encapsulation. + + To compile it as a module, choose M here. If unsure, say N. + # The targets config IP6_NF_FILTER tristate "Packet filtering" diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 9ab5b2ca1f..c0c809b426 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o ip6t_dst.o obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o obj-$(CONFIG_IP6_NF_MATCH_AHESP) += ip6t_esp.o ip6t_ah.o +obj-$(CONFIG_IP6_NF_MATCH_POLICY) += ip6t_policy.o obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o obj-$(CONFIG_IP6_NF_MATCH_MULTIPORT) += ip6t_multiport.o obj-$(CONFIG_IP6_NF_MATCH_OWNER) += ip6t_owner.o diff --git a/net/ipv6/netfilter/ip6t_policy.c b/net/ipv6/netfilter/ip6t_policy.c new file mode 100644 index 0000000000..13fedad48c --- /dev/null +++ b/net/ipv6/netfilter/ip6t_policy.c @@ -0,0 +1,175 @@ +/* IP tables module for matching IPsec policy + * + * Copyright (c) 2004,2005 Patrick McHardy, + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("IPtables IPsec policy matching module"); +MODULE_LICENSE("GPL"); + + +static inline int +match_xfrm_state(struct xfrm_state *x, const struct ip6t_policy_elem *e) +{ +#define MATCH_ADDR(x,y,z) (!e->match.x || \ + ((ip6_masked_addrcmp((z), &e->x, &e->y)) == 0) ^ e->invert.x) +#define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x)) + + return MATCH_ADDR(saddr, smask, (struct in6_addr *)&x->props.saddr.a6) && + MATCH_ADDR(daddr, dmask, (struct in6_addr *)&x->id.daddr.a6) && + MATCH(proto, x->id.proto) && + MATCH(mode, x->props.mode) && + MATCH(spi, x->id.spi) && + MATCH(reqid, x->props.reqid); +} + +static int +match_policy_in(const struct sk_buff *skb, const struct ip6t_policy_info *info) +{ + const struct ip6t_policy_elem *e; + struct sec_path *sp = skb->sp; + int strict = info->flags & IP6T_POLICY_MATCH_STRICT; + int i, pos; + + if (sp == NULL) + return -1; + if (strict && info->len != sp->len) + return 0; + + for (i = sp->len - 1; i >= 0; i--) { + pos = strict ? i - sp->len + 1 : 0; + if (pos >= info->len) + return 0; + e = &info->pol[pos]; + + if (match_xfrm_state(sp->x[i].xvec, e)) { + if (!strict) + return 1; + } else if (strict) + return 0; + } + + return strict ? 1 : 0; +} + +static int +match_policy_out(const struct sk_buff *skb, const struct ip6t_policy_info *info) +{ + const struct ip6t_policy_elem *e; + struct dst_entry *dst = skb->dst; + int strict = info->flags & IP6T_POLICY_MATCH_STRICT; + int i, pos; + + if (dst->xfrm == NULL) + return -1; + + for (i = 0; dst && dst->xfrm; dst = dst->child, i++) { + pos = strict ? i : 0; + if (pos >= info->len) + return 0; + e = &info->pol[pos]; + + if (match_xfrm_state(dst->xfrm, e)) { + if (!strict) + return 1; + } else if (strict) + return 0; + } + + return strict ? 1 : 0; +} + +static int match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + const struct ip6t_policy_info *info = matchinfo; + int ret; + + if (info->flags & IP6T_POLICY_MATCH_IN) + ret = match_policy_in(skb, info); + else + ret = match_policy_out(skb, info); + + if (ret < 0) + ret = info->flags & IP6T_POLICY_MATCH_NONE ? 
1 : 0; + else if (info->flags & IP6T_POLICY_MATCH_NONE) + ret = 0; + + return ret; +} + +static int checkentry(const char *tablename, const struct ip6t_ip6 *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + struct ip6t_policy_info *info = matchinfo; + + if (matchsize != IP6T_ALIGN(sizeof(*info))) { + printk(KERN_ERR "ip6t_policy: matchsize %u != %zu\n", + matchsize, IP6T_ALIGN(sizeof(*info))); + return 0; + } + if (!(info->flags & (IP6T_POLICY_MATCH_IN|IP6T_POLICY_MATCH_OUT))) { + printk(KERN_ERR "ip6t_policy: neither incoming nor " + "outgoing policy selected\n"); + return 0; + } + if (hook_mask & (1 << NF_IP6_PRE_ROUTING | 1 << NF_IP6_LOCAL_IN) + && info->flags & IP6T_POLICY_MATCH_OUT) { + printk(KERN_ERR "ip6t_policy: output policy not valid in " + "PRE_ROUTING and INPUT\n"); + return 0; + } + if (hook_mask & (1 << NF_IP6_POST_ROUTING | 1 << NF_IP6_LOCAL_OUT) + && info->flags & IP6T_POLICY_MATCH_IN) { + printk(KERN_ERR "ip6t_policy: input policy not valid in " + "POST_ROUTING and OUTPUT\n"); + return 0; + } + if (info->len > IP6T_POLICY_MAX_ELEM) { + printk(KERN_ERR "ip6t_policy: too many policy elements\n"); + return 0; + } + + return 1; +} + +static struct ip6t_match policy_match = { + .name = "policy", + .match = match, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&policy_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&policy_match); +} + +module_init(init); +module_exit(fini); -- cgit v1.2.2 From b341387225832c392ed83b9f89d15668b033a106 Mon Sep 17 00:00:00 2001 From: Francois Romieu Date: Fri, 6 Jan 2006 23:08:42 -0800 Subject: [AX25/MKISS]: unbalanced spinlock_bh in ax_encaps() The unlocking disappeared during commit 5793f4be23f0171b4999ca68a39a9157b44139f3. Signed-off-by: Francois Romieu Signed-off-by: David S. Miller --- drivers/net/hamradio/mkiss.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 3e9accf137..41b3d83c2a 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -524,6 +524,7 @@ static void ax_encaps(struct net_device *dev, unsigned char *icp, int len) ax->dev->trans_start = jiffies; ax->xleft = count - actual; ax->xhead = ax->xbuff + actual; + spin_unlock_bh(&ax->buflock); } /* Encapsulate an AX.25 packet and kick it into a TTY queue. */ -- cgit v1.2.2 From da7bc6ee8ee3976696fc740065362ca442eb61c9 Mon Sep 17 00:00:00 2001 From: Joe Kappus Date: Fri, 6 Jan 2006 23:15:04 -0800 Subject: [NETFILTER]: ip_conntrack_proto_sctp.c needs linux/interrupt.h Signed-off-by: Joe Kappus Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index 977fb59d45..0b25050981 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.2 From f53b61d8c385140fe7f09e0c9187ae813ee9f330 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 7 Jan 2006 12:50:27 -0800 Subject: [NETFILTER]: Add dummy nf_hook{_thresh}() when NETFILTER is disabled. Signed-off-by: David S. 
Miller --- include/linux/netfilter.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 84506dfa1f..4cf6088625 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -295,7 +295,22 @@ extern struct proc_dir_entry *proc_net_netfilter; #else /* !CONFIG_NETFILTER */ #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb) +static inline int nf_hook_thresh(int pf, unsigned int hook, + struct sk_buff **pskb, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), int thresh) +{ + return okfn(*pskb); +} +static inline int nf_hook(int pf, unsigned int hook, struct sk_buff **pskb, + struct net_device *indev, struct net_device *outdev, + int (*okfn)(struct sk_buff *)) +{ + return okfn(*pskb); +} static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} +struct flowi; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, int family) {} #endif /*CONFIG_NETFILTER*/ -- cgit v1.2.2 From 97dc627fb3471664c72d0933790a90ba3f91e131 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 7 Jan 2006 13:23:39 -0800 Subject: [IPV4]: make ip_fragment() static Since there's no longer any external user of ip_fragment() we can make it static. Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- include/net/ip.h | 1 - net/ipv4/ip_output.c | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index a494d04e5d..8de0697b36 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -94,7 +94,6 @@ extern int ip_local_deliver(struct sk_buff *skb); extern int ip_mr_input(struct sk_buff *skb); extern int ip_output(struct sk_buff *skb); extern int ip_mc_output(struct sk_buff *skb); -extern int ip_fragment(struct sk_buff *skb, int (*out)(struct sk_buff*)); extern int ip_do_nat(struct sk_buff *skb); extern void ip_send_check(struct iphdr *ip); extern int ip_queue_xmit(struct sk_buff *skb, int ipfragok); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 59fdac3a09..c2169b47dd 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -85,6 +85,8 @@ int sysctl_ip_default_ttl = IPDEFTTL; +static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)); + /* Generate a checksum for an outgoing IP datagram. */ __inline__ void ip_send_check(struct iphdr *iph) { @@ -414,7 +416,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) * single device frame, and queue such a frame for sending. */ -int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) +static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) { struct iphdr *iph; int raw = 0; @@ -1396,7 +1398,6 @@ void __init ip_init(void) #endif } -EXPORT_SYMBOL(ip_fragment); EXPORT_SYMBOL(ip_generic_getfrag); EXPORT_SYMBOL(ip_queue_xmit); EXPORT_SYMBOL(ip_send_check); -- cgit v1.2.2 From 9f5336e21893fafd232a9a02cfa7588ad153889a Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 7 Jan 2006 13:24:25 -0800 Subject: [IPV6]: small cleanups This patch contains the following cleanups: - addrconf.c: make addrconf_dad_stop() static - inet6_connection_sock.c should #include for getting the prototypes of it's global functions Signed-off-by: Adrian Bunk Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 2 +- net/ipv6/inet6_connection_sock.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 704fb73e6c..e53e421eee 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1228,7 +1228,7 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) /* Gets referenced address, destroys ifaddr */ -void addrconf_dad_stop(struct inet6_ifaddr *ifp) +static void addrconf_dad_stop(struct inet6_ifaddr *ifp) { if (ifp->flags&IFA_F_PERMANENT) { spin_lock_bh(&ifp->lock); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 792f90f0f9..f8f3a37a14 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -25,6 +25,7 @@ #include #include #include +#include int inet6_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb) -- cgit v1.2.2 From 84c2008af01132c4ca257ed9b595693c611df15d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:00:28 -0800 Subject: [PATCH] revert "mm: page_state fixes" Hugh says: page_alloc_cpu_notify() specifically contains code to /* Add dead cpu's page_states to our own. */ which handles this more efficiently. Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd47494cb9..0b98f428b0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1204,6 +1204,7 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) int cpu = 0; memset(ret, 0, sizeof(*ret)); + cpus_and(*cpumask, *cpumask, cpu_online_map); cpu = first_cpu(*cpumask); while (cpu < NR_CPUS) { @@ -1256,7 +1257,7 @@ unsigned long read_page_state_offset(unsigned long offset) unsigned long ret = 0; int cpu; - for_each_cpu(cpu) { + for_each_online_cpu(cpu) { unsigned long in; in = (unsigned long)&per_cpu(page_states, cpu) + offset; -- cgit v1.2.2 From 5998bf1ddb5fb236597190b2274d357add63fd7e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:00:29 -0800 Subject: [PATCH] asm-generic/atomic.h needs types.h For BITS_PER_LONG Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/atomic.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index e0a28b925e..0fada8f16d 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -8,6 +8,7 @@ * edit all arch specific atomic.h files. */ +#include /* * Suppport for atomic_long_t -- cgit v1.2.2 From 70c00ba0bbfb583e39b964ebd88620c18aa02c62 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Sun, 8 Jan 2006 01:00:29 -0800 Subject: [PATCH] small hp_sdc_rtc cleanup: use no_llseek Use no_llseek function. Signed-off-by: Marcelo Tosatti Cc: "Brian S. 
Julin" Acked-by: Vojtech Pavlik Cc: Dmitry Torokhov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/input/misc/hp_sdc_rtc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/input/misc/hp_sdc_rtc.c b/drivers/input/misc/hp_sdc_rtc.c index 1cd7657f7e..1be963961c 100644 --- a/drivers/input/misc/hp_sdc_rtc.c +++ b/drivers/input/misc/hp_sdc_rtc.c @@ -60,8 +60,6 @@ static struct fasync_struct *hp_sdc_rtc_async_queue; static DECLARE_WAIT_QUEUE_HEAD(hp_sdc_rtc_wait); -static loff_t hp_sdc_rtc_llseek(struct file *file, loff_t offset, int origin); - static ssize_t hp_sdc_rtc_read(struct file *file, char *buf, size_t count, loff_t *ppos); @@ -387,11 +385,6 @@ static int hp_sdc_rtc_set_i8042timer (struct timeval *setto, uint8_t setcmd) return 0; } -static loff_t hp_sdc_rtc_llseek(struct file *file, loff_t offset, int origin) -{ - return -ESPIPE; -} - static ssize_t hp_sdc_rtc_read(struct file *file, char *buf, size_t count, loff_t *ppos) { ssize_t retval; @@ -679,7 +672,7 @@ static int hp_sdc_rtc_ioctl(struct inode *inode, struct file *file, static struct file_operations hp_sdc_rtc_fops = { .owner = THIS_MODULE, - .llseek = hp_sdc_rtc_llseek, + .llseek = no_llseek, .read = hp_sdc_rtc_read, .poll = hp_sdc_rtc_poll, .ioctl = hp_sdc_rtc_ioctl, -- cgit v1.2.2 From 4dab06fa7b6d07ee95c8bba5ddd0840633069159 Mon Sep 17 00:00:00 2001 From: Woody Suwalski Date: Sun, 8 Jan 2006 01:00:31 -0800 Subject: [PATCH] ARM Netwinder watchdog wdt977 update Cleanup for the ARM-only watchdog driver wdt977. This is probably the last update, since we want to merge with w83977f_wdt. Jose Goncalves has ported this driver to i386, so probably we can iron out configuration differences. Signed-off-by: Woody Suwalski Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/watchdog/wdt977.c | 216 +++++++++++++++++++++++++++-------------- 1 file changed, 141 insertions(+), 75 deletions(-) diff --git a/drivers/char/watchdog/wdt977.c b/drivers/char/watchdog/wdt977.c index 44d49dfacb..3843900e94 100644 --- a/drivers/char/watchdog/wdt977.c +++ b/drivers/char/watchdog/wdt977.c @@ -1,5 +1,5 @@ /* - * Wdt977 0.03: A Watchdog Device for Netwinder W83977AF chip + * Wdt977 0.04: A Watchdog Device for Netwinder W83977AF chip * * (c) Copyright 1998 Rebel.com (Woody Suwalski ) * @@ -18,6 +18,8 @@ * from minutes to seconds. * 07-Jul-2003 Daniele Bellucci: Audit return code of misc_register in * nwwatchdog_init. 
+ * 25-Oct-2005 Woody Suwalski: Convert addresses to #defs, add spinlocks + * remove limitiation to be used on Netwinders only */ #include @@ -28,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -37,8 +40,18 @@ #include #include -#define PFX "Wdt977: " -#define WATCHDOG_MINOR 130 +#define WATCHDOG_VERSION "0.04" +#define WATCHDOG_NAME "Wdt977" +#define PFX WATCHDOG_NAME ": " +#define DRIVER_VERSION WATCHDOG_NAME " driver, v" WATCHDOG_VERSION "\n" + +#define IO_INDEX_PORT 0x370 /* on some systems it can be 0x3F0 */ +#define IO_DATA_PORT (IO_INDEX_PORT+1) + +#define UNLOCK_DATA 0x87 +#define LOCK_DATA 0xAA +#define DEVICE_REGISTER 0x07 + #define DEFAULT_TIMEOUT 60 /* default timeout in seconds */ @@ -47,6 +60,7 @@ static int timeoutM; /* timeout in minutes */ static unsigned long timer_alive; static int testmode; static char expect_close; +static spinlock_t spinlock; module_param(timeout, int, 0); MODULE_PARM_DESC(timeout,"Watchdog timeout in seconds (60..15300), default=" __MODULE_STRING(DEFAULT_TIMEOUT) ")"); @@ -63,9 +77,13 @@ MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CON static int wdt977_start(void) { + unsigned long flags; + + spin_lock_irqsave(&spinlock, flags); + /* unlock the SuperIO chip */ - outb(0x87,0x370); - outb(0x87,0x370); + outb_p(UNLOCK_DATA, IO_INDEX_PORT); + outb_p(UNLOCK_DATA, IO_INDEX_PORT); /* select device Aux2 (device=8) and set watchdog regs F2, F3 and F4 * F2 has the timeout in minutes @@ -73,28 +91,29 @@ static int wdt977_start(void) * at timeout, and to reset timer on kbd/mouse activity (not impl.) * F4 is used to just clear the TIMEOUT'ed state (bit 0) */ - outb(0x07,0x370); - outb(0x08,0x371); - outb(0xF2,0x370); - outb(timeoutM,0x371); - outb(0xF3,0x370); - outb(0x00,0x371); /* another setting is 0E for kbd/mouse/LED */ - outb(0xF4,0x370); - outb(0x00,0x371); + outb_p(DEVICE_REGISTER, IO_INDEX_PORT); + outb_p(0x08, IO_DATA_PORT); + outb_p(0xF2, IO_INDEX_PORT); + outb_p(timeoutM, IO_DATA_PORT); + outb_p(0xF3, IO_INDEX_PORT); + outb_p(0x00, IO_DATA_PORT); /* another setting is 0E for kbd/mouse/LED */ + outb_p(0xF4, IO_INDEX_PORT); + outb_p(0x00, IO_DATA_PORT); /* at last select device Aux1 (dev=7) and set GP16 as a watchdog output */ /* in test mode watch the bit 1 on F4 to indicate "triggered" */ if (!testmode) { - outb(0x07,0x370); - outb(0x07,0x371); - outb(0xE6,0x370); - outb(0x08,0x371); + outb_p(DEVICE_REGISTER, IO_INDEX_PORT); + outb_p(0x07, IO_DATA_PORT); + outb_p(0xE6, IO_INDEX_PORT); + outb_p(0x08, IO_DATA_PORT); } /* lock the SuperIO chip */ - outb(0xAA,0x370); + outb_p(LOCK_DATA, IO_INDEX_PORT); + spin_unlock_irqrestore(&spinlock, flags); printk(KERN_INFO PFX "activated.\n"); return 0; @@ -106,35 +125,39 @@ static int wdt977_start(void) static int wdt977_stop(void) { + unsigned long flags; + spin_lock_irqsave(&spinlock, flags); + /* unlock the SuperIO chip */ - outb(0x87,0x370); - outb(0x87,0x370); + outb_p(UNLOCK_DATA, IO_INDEX_PORT); + outb_p(UNLOCK_DATA, IO_INDEX_PORT); /* select device Aux2 (device=8) and set watchdog regs F2,F3 and F4 * F3 is reset to its default state * F4 can clear the TIMEOUT'ed state (bit 0) - back to default * We can not use GP17 as a PowerLed, as we use its usage as a RedLed */ - outb(0x07,0x370); - outb(0x08,0x371); - outb(0xF2,0x370); - outb(0xFF,0x371); - outb(0xF3,0x370); - outb(0x00,0x371); - outb(0xF4,0x370); - outb(0x00,0x371); - outb(0xF2,0x370); - outb(0x00,0x371); + outb_p(DEVICE_REGISTER, IO_INDEX_PORT); + outb_p(0x08, IO_DATA_PORT); + outb_p(0xF2, 
IO_INDEX_PORT); + outb_p(0xFF, IO_DATA_PORT); + outb_p(0xF3, IO_INDEX_PORT); + outb_p(0x00, IO_DATA_PORT); + outb_p(0xF4, IO_INDEX_PORT); + outb_p(0x00, IO_DATA_PORT); + outb_p(0xF2, IO_INDEX_PORT); + outb_p(0x00, IO_DATA_PORT); /* at last select device Aux1 (dev=7) and set GP16 as a watchdog output */ - outb(0x07,0x370); - outb(0x07,0x371); - outb(0xE6,0x370); - outb(0x08,0x371); + outb_p(DEVICE_REGISTER, IO_INDEX_PORT); + outb_p(0x07, IO_DATA_PORT); + outb_p(0xE6, IO_INDEX_PORT); + outb_p(0x08, IO_DATA_PORT); /* lock the SuperIO chip */ - outb(0xAA,0x370); + outb_p(LOCK_DATA, IO_INDEX_PORT); + spin_unlock_irqrestore(&spinlock, flags); printk(KERN_INFO PFX "shutdown.\n"); return 0; @@ -147,19 +170,23 @@ static int wdt977_stop(void) static int wdt977_keepalive(void) { + unsigned long flags; + spin_lock_irqsave(&spinlock, flags); + /* unlock the SuperIO chip */ - outb(0x87,0x370); - outb(0x87,0x370); + outb_p(UNLOCK_DATA, IO_INDEX_PORT); + outb_p(UNLOCK_DATA, IO_INDEX_PORT); /* select device Aux2 (device=8) and kicks watchdog reg F2 */ /* F2 has the timeout in minutes */ - outb(0x07,0x370); - outb(0x08,0x371); - outb(0xF2,0x370); - outb(timeoutM,0x371); + outb_p(DEVICE_REGISTER, IO_INDEX_PORT); + outb_p(0x08, IO_DATA_PORT); + outb_p(0xF2, IO_INDEX_PORT); + outb_p(timeoutM, IO_DATA_PORT); /* lock the SuperIO chip */ - outb(0xAA,0x370); + outb_p(LOCK_DATA, IO_INDEX_PORT); + spin_unlock_irqrestore(&spinlock, flags); return 0; } @@ -198,22 +225,26 @@ static int wdt977_set_timeout(int t) static int wdt977_get_status(int *status) { int new_status; + unsigned long flags; - *status=0; + spin_lock_irqsave(&spinlock, flags); /* unlock the SuperIO chip */ - outb(0x87,0x370); - outb(0x87,0x370); + outb_p(UNLOCK_DATA, IO_INDEX_PORT); + outb_p(UNLOCK_DATA, IO_INDEX_PORT); /* select device Aux2 (device=8) and read watchdog reg F4 */ - outb(0x07,0x370); - outb(0x08,0x371); - outb(0xF4,0x370); - new_status = inb(0x371); + outb_p(DEVICE_REGISTER, IO_INDEX_PORT); + outb_p(0x08, IO_DATA_PORT); + outb_p(0xF4, IO_INDEX_PORT); + new_status = inb_p(IO_DATA_PORT); /* lock the SuperIO chip */ - outb(0xAA,0x370); + outb_p(LOCK_DATA, IO_INDEX_PORT); + spin_unlock_irqrestore(&spinlock, flags); + + *status=0; if (new_status & 1) *status |= WDIOF_CARDRESET; @@ -249,8 +280,8 @@ static int wdt977_release(struct inode *inode, struct file *file) wdt977_stop(); clear_bit(0,&timer_alive); } else { - printk(KERN_CRIT PFX "Unexpected close, not stopping watchdog!\n"); wdt977_keepalive(); + printk(KERN_CRIT PFX "Unexpected close, not stopping watchdog!\n"); } expect_close = 0; return 0; @@ -271,14 +302,17 @@ static int wdt977_release(struct inode *inode, struct file *file) static ssize_t wdt977_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - if (count) { - if (!nowayout) { + if (count) + { + if (!nowayout) + { size_t i; /* In case it was set long ago */ expect_close = 0; - for (i = 0; i != count; i++) { + for (i = 0; i != count; i++) + { char c; if (get_user(c, buf + i)) return -EFAULT; @@ -287,6 +321,7 @@ static ssize_t wdt977_write(struct file *file, const char __user *buf, } } + /* someone wrote to us, we should restart timer */ wdt977_keepalive(); } return count; @@ -308,7 +343,7 @@ static struct watchdog_info ident = { WDIOF_MAGICCLOSE | WDIOF_KEEPALIVEPING, .firmware_version = 1, - .identity = "Winbond 83977", + .identity = WATCHDOG_NAME, }; static int wdt977_ioctl(struct inode *inode, struct file *file, @@ -405,50 +440,81 @@ static struct notifier_block wdt977_notifier = { .notifier_call = 
wdt977_notify_sys, }; -static int __init nwwatchdog_init(void) +static int __init wd977_init(void) { - int retval; - if (!machine_is_netwinder()) - return -ENODEV; + int rc; + + //if (!machine_is_netwinder()) + // return -ENODEV; + + printk(KERN_INFO PFX DRIVER_VERSION); + + spin_lock_init(&spinlock); /* Check that the timeout value is within it's range ; if not reset to the default */ - if (wdt977_set_timeout(timeout)) { + if (wdt977_set_timeout(timeout)) + { wdt977_set_timeout(DEFAULT_TIMEOUT); printk(KERN_INFO PFX "timeout value must be 60"); +MODULE_AUTHOR("Woody Suwalski "); MODULE_DESCRIPTION("W83977AF Watchdog driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR); -- cgit v1.2.2 From b792de39d892e06b18ddea85be076bae123d6bf6 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Sun, 8 Jan 2006 01:00:32 -0800 Subject: [PATCH] Fix compilation with CONFIG_MEMORY_HOTPLUG=y and gcc41. Fix compilation with CONFIG_MEMORY_HOTPLUG=y and gcc41. Also remove unneeded declations, add a public function. drivers/base/memory.c:53: error: static declaration of 'register_memory_notifier' follows non-static declaration include/linux/memory.h:85: error: previous declaration of 'register_memory_notifier' was here drivers/base/memory.c:58: error: static declaration of 'unregister_memory_notifier' follows non-static declaration include/linux/memory.h:86: error: previous declaration of 'unregister_memory_notifier' was here drivers/base/memory.c:68: error: static declaration of 'register_memory' follows non-static declaration include/linux/memory.h:73: error: previous declaration of 'register_memory' was here Signed-off-by: Olaf Hering Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/include/linux/memory.h b/include/linux/memory.h index dc4081b6f1..e251dc43d0 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -70,21 +70,15 @@ static inline void unregister_memory_notifier(struct notifier_block *nb) { } #else -extern int register_memory(struct memory_block *, struct mem_section *section, struct node *); extern int register_new_memory(struct mem_section *); extern int unregister_memory_section(struct mem_section *); extern int memory_dev_init(void); -extern int register_memory_notifier(struct notifier_block *nb); -extern void unregister_memory_notifier(struct notifier_block *nb); +extern int remove_memory_block(unsigned long, struct mem_section *, int); #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION< Date: Sun, 8 Jan 2006 01:00:33 -0800 Subject: [PATCH] slab: remove unused align parameter from alloc_percpu __alloc_percpu and alloc_percpu both take an 'align' argument which is completely ignored. snmp6_mib_init() in net/ipv6/af_inet6.c attempts to use it, but it will be ignored. Therefore, remove the 'align' argument and fixup the lone caller. 
Signed-off-by: Matthew Dobson Acked-by: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/percpu.h | 7 +++---- mm/slab.c | 3 +-- net/ipv6/af_inet6.c | 4 ++-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index fb8d2d24e4..20317d88de 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -33,14 +33,14 @@ struct percpu_data { (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) -extern void *__alloc_percpu(size_t size, size_t align); +extern void *__alloc_percpu(size_t size); extern void free_percpu(const void *); #else /* CONFIG_SMP */ #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) -static inline void *__alloc_percpu(size_t size, size_t align) +static inline void *__alloc_percpu(size_t size) { void *ret = kmalloc(size, GFP_KERNEL); if (ret) @@ -55,7 +55,6 @@ static inline void free_percpu(const void *ptr) #endif /* CONFIG_SMP */ /* Simple wrapper for the common case: zeros memory. */ -#define alloc_percpu(type) \ - ((type *)(__alloc_percpu(sizeof(type), __alignof__(type)))) +#define alloc_percpu(type) ((type *)(__alloc_percpu(sizeof(type)))) #endif /* __LINUX_PERCPU_H */ diff --git a/mm/slab.c b/mm/slab.c index e5ec26e0c4..eb70fddf20 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2944,9 +2944,8 @@ EXPORT_SYMBOL(__kmalloc); * Objects should be dereferenced using the per_cpu_ptr macro only. * * @size: how many bytes of memory are required. - * @align: the alignment, which can't be greater than SMP_CACHE_BYTES. */ -void *__alloc_percpu(size_t size, size_t align) +void *__alloc_percpu(size_t size) { int i; struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 68afc53be6..25c3fe5005 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -689,11 +689,11 @@ snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) if (ptr == NULL) return -EINVAL; - ptr[0] = __alloc_percpu(mibsize, mibalign); + ptr[0] = __alloc_percpu(mibsize); if (!ptr[0]) goto err0; - ptr[1] = __alloc_percpu(mibsize, mibalign); + ptr[1] = __alloc_percpu(mibsize); if (!ptr[1]) goto err1; -- cgit v1.2.2 From 85289f98ddc13f6cea82c59d6ff78f9d205dfccc Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 8 Jan 2006 01:00:36 -0800 Subject: [PATCH] slab: extract slabinfo header printing to separate function This patch extracts slabinfo header printing to a separate function print_slabinfo_header() to make s_start() more readable. Signed-off-by: Matthew Dobson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index eb70fddf20..3d3b5a4685 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3364,32 +3364,37 @@ next: #ifdef CONFIG_PROC_FS -static void *s_start(struct seq_file *m, loff_t *pos) +static void print_slabinfo_header(struct seq_file *m) { - loff_t n = *pos; - struct list_head *p; - - down(&cache_chain_sem); - if (!n) { - /* - * Output format version, so at least we can change it - * without _too_ many complaints. - */ + /* + * Output format version, so at least we can change it + * without _too_ many complaints. 
+ */ #if STATS - seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); + seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); #else - seq_puts(m, "slabinfo - version: 2.1\n"); + seq_puts(m, "slabinfo - version: 2.1\n"); #endif - seq_puts(m, "# name "); - seq_puts(m, " : tunables "); - seq_puts(m, " : slabdata "); + seq_puts(m, "# name " + " "); + seq_puts(m, " : tunables "); + seq_puts(m, " : slabdata "); #if STATS - seq_puts(m, " : globalstat " - " "); - seq_puts(m, " : cpustat "); + seq_puts(m, " : globalstat " + " "); + seq_puts(m, " : cpustat "); #endif - seq_putc(m, '\n'); - } + seq_putc(m, '\n'); +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + struct list_head *p; + + down(&cache_chain_sem); + if (!n) + print_slabinfo_header(m); p = cache_chain.next; while (n--) { p = p->next; -- cgit v1.2.2 From 4d268eba1187ef66844a6a33b9431e5d0dadd4ad Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 8 Jan 2006 01:00:36 -0800 Subject: [PATCH] slab: extract slab order calculation to separate function This patch moves the ugly loop that determines the 'optimal' size (page order) of cache slabs from kmem_cache_create() to a separate function and cleans it up a bit. Thanks to Matthew Wilcox for the help with this patch. Signed-off-by: Matthew Dobson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 89 +++++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 3d3b5a4685..2551b1eead 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1473,6 +1473,53 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index) } } +/** + * calculate_slab_order - calculate size (page order) of slabs and the number + * of objects per slab. + * + * This could be made much more intelligent. For now, try to avoid using + * high order pages for slabs. When the gfp() functions are more friendly + * towards high-order requests, this should be changed. + */ +static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size, + size_t align, gfp_t flags) +{ + size_t left_over = 0; + + for ( ; ; cachep->gfporder++) { + unsigned int num; + size_t remainder; + + if (cachep->gfporder > MAX_GFP_ORDER) { + cachep->num = 0; + break; + } + + cache_estimate(cachep->gfporder, size, align, flags, + &remainder, &num); + if (!num) + continue; + /* More than offslab_limit objects will cause problems */ + if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) + break; + + cachep->num = num; + left_over = remainder; + + /* + * Large number of objects is good, but very large slabs are + * currently bad for the gfp()s. + */ + if (cachep->gfporder >= slab_break_gfp_order) + break; + + if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder)) + /* Acceptable internal fragmentation */ + break; + } + return left_over; +} + /** * kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. @@ -1682,46 +1729,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->gfporder = 0; cache_estimate(cachep->gfporder, size, align, flags, &left_over, &cachep->num); - } else { - /* - * Calculate size (in pages) of slabs, and the num of objs per - * slab. This could be made much more intelligent. For now, - * try to avoid using high page-orders for slabs. When the - * gfp() funcs are more friendly towards high-order requests, - * this should be changed. 
- */ - do { - unsigned int break_flag = 0; -cal_wastage: - cache_estimate(cachep->gfporder, size, align, flags, - &left_over, &cachep->num); - if (break_flag) - break; - if (cachep->gfporder >= MAX_GFP_ORDER) - break; - if (!cachep->num) - goto next; - if (flags & CFLGS_OFF_SLAB && - cachep->num > offslab_limit) { - /* This num of objs will cause problems. */ - cachep->gfporder--; - break_flag++; - goto cal_wastage; - } - - /* - * Large num of objs is good, but v. large slabs are - * currently bad for the gfp()s. - */ - if (cachep->gfporder >= slab_break_gfp_order) - break; - - if ((left_over*8) <= (PAGE_SIZE<gfporder)) - break; /* Acceptable internal fragmentation. */ -next: - cachep->gfporder++; - } while (1); - } + } else + left_over = calculate_slab_order(cachep, size, align, flags); if (!cachep->num) { printk("kmem_cache_create: couldn't create cache %s.\n", name); -- cgit v1.2.2 From b28a02de8c70d41d6b6ba8911e83ed3ccf2e13f8 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 8 Jan 2006 01:00:37 -0800 Subject: [PATCH] slab: fix code formatting The slab allocator code is inconsistent in coding style and messy. For this patch, I ran Lindent for mm/slab.c and fixed up goofs by hand. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 964 ++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 500 insertions(+), 464 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 2551b1eead..f71d8be2f4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -130,7 +130,6 @@ #define FORCED_DEBUG 0 #endif - /* Shouldn't this be in a header file somewhere? */ #define BYTES_PER_WORD sizeof(void *) @@ -217,12 +216,12 @@ static unsigned long offslab_limit; * Slabs are chained into three list: fully used, partial, fully free slabs. */ struct slab { - struct list_head list; - unsigned long colouroff; - void *s_mem; /* including colour offset */ - unsigned int inuse; /* num of objs active in slab */ - kmem_bufctl_t free; - unsigned short nodeid; + struct list_head list; + unsigned long colouroff; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; + unsigned short nodeid; }; /* @@ -242,9 +241,9 @@ struct slab { * We assume struct slab_rcu can overlay struct slab when destroying. */ struct slab_rcu { - struct rcu_head head; - kmem_cache_t *cachep; - void *addr; + struct rcu_head head; + kmem_cache_t *cachep; + void *addr; }; /* @@ -279,23 +278,23 @@ struct array_cache { #define BOOT_CPUCACHE_ENTRIES 1 struct arraycache_init { struct array_cache cache; - void * entries[BOOT_CPUCACHE_ENTRIES]; + void *entries[BOOT_CPUCACHE_ENTRIES]; }; /* * The slab lists for all objects. 
*/ struct kmem_list3 { - struct list_head slabs_partial; /* partial list first, better asm code */ - struct list_head slabs_full; - struct list_head slabs_free; - unsigned long free_objects; - unsigned long next_reap; - int free_touched; - unsigned int free_limit; - spinlock_t list_lock; - struct array_cache *shared; /* shared per node */ - struct array_cache **alien; /* on other nodes */ + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long free_objects; + unsigned long next_reap; + int free_touched; + unsigned int free_limit; + spinlock_t list_lock; + struct array_cache *shared; /* shared per node */ + struct array_cache **alien; /* on other nodes */ }; /* @@ -367,63 +366,63 @@ static inline void kmem_list3_init(struct kmem_list3 *parent) * * manages a cache. */ - + struct kmem_cache { /* 1) per-cpu data, touched during every alloc/free */ - struct array_cache *array[NR_CPUS]; - unsigned int batchcount; - unsigned int limit; - unsigned int shared; - unsigned int objsize; + struct array_cache *array[NR_CPUS]; + unsigned int batchcount; + unsigned int limit; + unsigned int shared; + unsigned int objsize; /* 2) touched by every alloc & free from the backend */ - struct kmem_list3 *nodelists[MAX_NUMNODES]; - unsigned int flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - spinlock_t spinlock; + struct kmem_list3 *nodelists[MAX_NUMNODES]; + unsigned int flags; /* constant flags */ + unsigned int num; /* # of objs per slab */ + spinlock_t spinlock; /* 3) cache_grow/shrink */ /* order of pgs per slab (2^n) */ - unsigned int gfporder; + unsigned int gfporder; /* force GFP flags, e.g. GFP_DMA */ - gfp_t gfpflags; + gfp_t gfpflags; - size_t colour; /* cache colouring range */ - unsigned int colour_off; /* colour offset */ - unsigned int colour_next; /* cache colouring */ - kmem_cache_t *slabp_cache; - unsigned int slab_size; - unsigned int dflags; /* dynamic flags */ + size_t colour; /* cache colouring range */ + unsigned int colour_off; /* colour offset */ + unsigned int colour_next; /* cache colouring */ + kmem_cache_t *slabp_cache; + unsigned int slab_size; + unsigned int dflags; /* dynamic flags */ /* constructor func */ - void (*ctor)(void *, kmem_cache_t *, unsigned long); + void (*ctor) (void *, kmem_cache_t *, unsigned long); /* de-constructor func */ - void (*dtor)(void *, kmem_cache_t *, unsigned long); + void (*dtor) (void *, kmem_cache_t *, unsigned long); /* 4) cache creation/removal */ - const char *name; - struct list_head next; + const char *name; + struct list_head next; /* 5) statistics */ #if STATS - unsigned long num_active; - unsigned long num_allocations; - unsigned long high_mark; - unsigned long grown; - unsigned long reaped; - unsigned long errors; - unsigned long max_freeable; - unsigned long node_allocs; - unsigned long node_frees; - atomic_t allochit; - atomic_t allocmiss; - atomic_t freehit; - atomic_t freemiss; + unsigned long num_active; + unsigned long num_allocations; + unsigned long high_mark; + unsigned long grown; + unsigned long reaped; + unsigned long errors; + unsigned long max_freeable; + unsigned long node_allocs; + unsigned long node_frees; + atomic_t allochit; + atomic_t allocmiss; + atomic_t freehit; + atomic_t freemiss; #endif #if DEBUG - int dbghead; - int reallen; + int dbghead; + int reallen; #endif }; @@ -523,14 +522,15 @@ static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) { BUG_ON(!(cachep->flags & 
SLAB_RED_ZONE)); if (cachep->flags & SLAB_STORE_USER) - return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD); - return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD); + return (unsigned long *)(objp + cachep->objsize - + 2 * BYTES_PER_WORD); + return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD); } static void **dbg_userword(kmem_cache_t *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_STORE_USER)); - return (void**)(objp+cachep->objsize-BYTES_PER_WORD); + return (void **)(objp + cachep->objsize - BYTES_PER_WORD); } #else @@ -607,31 +607,31 @@ struct cache_names { static struct cache_names __initdata cache_names[] = { #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, #include - { NULL, } + {NULL,} #undef CACHE }; static struct arraycache_init initarray_cache __initdata = - { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; + { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; static struct arraycache_init initarray_generic = - { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; + { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ static kmem_cache_t cache_cache = { - .batchcount = 1, - .limit = BOOT_CPUCACHE_ENTRIES, - .shared = 1, - .objsize = sizeof(kmem_cache_t), - .flags = SLAB_NO_REAP, - .spinlock = SPIN_LOCK_UNLOCKED, - .name = "kmem_cache", + .batchcount = 1, + .limit = BOOT_CPUCACHE_ENTRIES, + .shared = 1, + .objsize = sizeof(kmem_cache_t), + .flags = SLAB_NO_REAP, + .spinlock = SPIN_LOCK_UNLOCKED, + .name = "kmem_cache", #if DEBUG - .reallen = sizeof(kmem_cache_t), + .reallen = sizeof(kmem_cache_t), #endif }; /* Guard access to the cache-chain. */ -static struct semaphore cache_chain_sem; +static struct semaphore cache_chain_sem; static struct list_head cache_chain; /* @@ -655,9 +655,9 @@ static enum { static DEFINE_PER_CPU(struct work_struct, reap_work); -static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node); -static void enable_cpucache (kmem_cache_t *cachep); -static void cache_reap (void *unused); +static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node); +static void enable_cpucache(kmem_cache_t *cachep); +static void cache_reap(void *unused); static int __node_shrink(kmem_cache_t *cachep, int node); static inline struct array_cache *ac_data(kmem_cache_t *cachep) @@ -671,9 +671,9 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags) #if DEBUG /* This happens if someone tries to call - * kmem_cache_create(), or __kmalloc(), before - * the generic caches are initialized. - */ + * kmem_cache_create(), or __kmalloc(), before + * the generic caches are initialized. + */ BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); #endif while (size > csizep->cs_size) @@ -697,10 +697,10 @@ EXPORT_SYMBOL(kmem_find_general_cachep); /* Cal the num objs, wastage, and bytes left over for a given slab size. 
*/ static void cache_estimate(unsigned long gfporder, size_t size, size_t align, - int flags, size_t *left_over, unsigned int *num) + int flags, size_t *left_over, unsigned int *num) { int i; - size_t wastage = PAGE_SIZE< 0) i--; @@ -718,8 +718,8 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align, i = SLAB_LIMIT; *num = i; - wastage -= i*size; - wastage -= ALIGN(base+i*extra, align); + wastage -= i * size; + wastage -= ALIGN(base + i * extra, align); *left_over = wastage; } @@ -728,7 +728,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align, static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) { printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", - function, cachep->name, msg); + function, cachep->name, msg); dump_stack(); } @@ -755,9 +755,9 @@ static void __devinit start_cpu_timer(int cpu) } static struct array_cache *alloc_arraycache(int node, int entries, - int batchcount) + int batchcount) { - int memsize = sizeof(void*)*entries+sizeof(struct array_cache); + int memsize = sizeof(void *) * entries + sizeof(struct array_cache); struct array_cache *nc = NULL; nc = kmalloc_node(memsize, GFP_KERNEL, node); @@ -775,7 +775,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, static inline struct array_cache **alloc_alien_cache(int node, int limit) { struct array_cache **ac_ptr; - int memsize = sizeof(void*)*MAX_NUMNODES; + int memsize = sizeof(void *) * MAX_NUMNODES; int i; if (limit > 1) @@ -789,7 +789,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit) } ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); if (!ac_ptr[i]) { - for (i--; i <=0; i--) + for (i--; i <= 0; i--) kfree(ac_ptr[i]); kfree(ac_ptr); return NULL; @@ -807,12 +807,13 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) return; for_each_node(i) - kfree(ac_ptr[i]); + kfree(ac_ptr[i]); kfree(ac_ptr); } -static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node) +static inline void __drain_alien_cache(kmem_cache_t *cachep, + struct array_cache *ac, int node) { struct kmem_list3 *rl3 = cachep->nodelists[node]; @@ -826,7 +827,7 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) { - int i=0; + int i = 0; struct array_cache *ac; unsigned long flags; @@ -846,10 +847,10 @@ static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) #endif static int __devinit cpuup_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) + unsigned long action, void *hcpu) { long cpu = (long)hcpu; - kmem_cache_t* cachep; + kmem_cache_t *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); int memsize = sizeof(struct kmem_list3); @@ -871,27 +872,27 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, */ if (!cachep->nodelists[node]) { if (!(l3 = kmalloc_node(memsize, - GFP_KERNEL, node))) + GFP_KERNEL, node))) goto bad; kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep)%REAPTIMEOUT_LIST3; + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; cachep->nodelists[node] = l3; } spin_lock_irq(&cachep->nodelists[node]->list_lock); cachep->nodelists[node]->free_limit = - (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; + (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; spin_unlock_irq(&cachep->nodelists[node]->list_lock); } /* Now we can go ahead 
with allocating the shared array's - & array cache's */ + & array cache's */ list_for_each_entry(cachep, &cache_chain, next) { nc = alloc_arraycache(node, cachep->limit, - cachep->batchcount); + cachep->batchcount); if (!nc) goto bad; cachep->array[cpu] = nc; @@ -900,12 +901,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, BUG_ON(!l3); if (!l3->shared) { if (!(nc = alloc_arraycache(node, - cachep->shared*cachep->batchcount, - 0xbaadf00d))) - goto bad; + cachep->shared * + cachep->batchcount, + 0xbaadf00d))) + goto bad; /* we are serialised from CPU_DEAD or - CPU_UP_CANCELLED by the cpucontrol lock */ + CPU_UP_CANCELLED by the cpucontrol lock */ l3->shared = nc; } } @@ -942,13 +944,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, free_block(cachep, nc->entry, nc->avail, node); if (!cpus_empty(mask)) { - spin_unlock(&l3->list_lock); - goto unlock_cache; - } + spin_unlock(&l3->list_lock); + goto unlock_cache; + } if (l3->shared) { free_block(cachep, l3->shared->entry, - l3->shared->avail, node); + l3->shared->avail, node); kfree(l3->shared); l3->shared = NULL; } @@ -966,7 +968,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, } else { spin_unlock(&l3->list_lock); } -unlock_cache: + unlock_cache: spin_unlock_irq(&cachep->spinlock); kfree(nc); } @@ -975,7 +977,7 @@ unlock_cache: #endif } return NOTIFY_OK; -bad: + bad: up(&cache_chain_sem); return NOTIFY_BAD; } @@ -985,8 +987,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; /* * swap the static kmem_list3 with kmalloced memory */ -static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, - int nodeid) +static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid) { struct kmem_list3 *ptr; @@ -1055,14 +1056,14 @@ void __init kmem_cache_init(void) cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, - &left_over, &cache_cache.num); + &left_over, &cache_cache.num); if (!cache_cache.num) BUG(); - cache_cache.colour = left_over/cache_cache.colour_off; + cache_cache.colour = left_over / cache_cache.colour_off; cache_cache.colour_next = 0; - cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + - sizeof(struct slab), cache_line_size()); + cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + + sizeof(struct slab), cache_line_size()); /* 2+3) create the kmalloc caches */ sizes = malloc_sizes; @@ -1074,14 +1075,18 @@ void __init kmem_cache_init(void) */ sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, - sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); + sizes[INDEX_AC].cs_size, + ARCH_KMALLOC_MINALIGN, + (ARCH_KMALLOC_FLAGS | + SLAB_PANIC), NULL, NULL); if (INDEX_AC != INDEX_L3) sizes[INDEX_L3].cs_cachep = - kmem_cache_create(names[INDEX_L3].name, - sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); + kmem_cache_create(names[INDEX_L3].name, + sizes[INDEX_L3].cs_size, + ARCH_KMALLOC_MINALIGN, + (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, + NULL); while (sizes->cs_size != ULONG_MAX) { /* @@ -1091,35 +1096,41 @@ void __init kmem_cache_init(void) * Note for systems short on memory removing the alignment will * allow tighter packing of the smaller caches. 
*/ - if(!sizes->cs_cachep) + if (!sizes->cs_cachep) sizes->cs_cachep = kmem_cache_create(names->name, - sizes->cs_size, ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + (ARCH_KMALLOC_FLAGS + | SLAB_PANIC), + NULL, NULL); /* Inc off-slab bufctl limit until the ceiling is hit. */ if (!(OFF_SLAB(sizes->cs_cachep))) { - offslab_limit = sizes->cs_size-sizeof(struct slab); + offslab_limit = sizes->cs_size - sizeof(struct slab); offslab_limit /= sizeof(kmem_bufctl_t); } sizes->cs_dmacachep = kmem_cache_create(names->name_dma, - sizes->cs_size, ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), - NULL, NULL); + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + (ARCH_KMALLOC_FLAGS | + SLAB_CACHE_DMA | + SLAB_PANIC), NULL, + NULL); sizes++; names++; } /* 4) Replace the bootstrap head arrays */ { - void * ptr; + void *ptr; ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); local_irq_disable(); BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); memcpy(ptr, ac_data(&cache_cache), - sizeof(struct arraycache_init)); + sizeof(struct arraycache_init)); cache_cache.array[smp_processor_id()] = ptr; local_irq_enable(); @@ -1127,11 +1138,11 @@ void __init kmem_cache_init(void) local_irq_disable(); BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) - != &initarray_generic.cache); + != &initarray_generic.cache); memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), - sizeof(struct arraycache_init)); + sizeof(struct arraycache_init)); malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = - ptr; + ptr; local_irq_enable(); } /* 5) Replace the bootstrap kmem_list3's */ @@ -1139,16 +1150,16 @@ void __init kmem_cache_init(void) int node; /* Replace the static kmem_list3 structures for the boot cpu */ init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], - numa_node_id()); + numa_node_id()); for_each_online_node(node) { init_list(malloc_sizes[INDEX_AC].cs_cachep, - &initkmem_list3[SIZE_AC+node], node); + &initkmem_list3[SIZE_AC + node], node); if (INDEX_AC != INDEX_L3) { init_list(malloc_sizes[INDEX_L3].cs_cachep, - &initkmem_list3[SIZE_L3+node], - node); + &initkmem_list3[SIZE_L3 + node], + node); } } } @@ -1158,7 +1169,7 @@ void __init kmem_cache_init(void) kmem_cache_t *cachep; down(&cache_chain_sem); list_for_each_entry(cachep, &cache_chain, next) - enable_cpucache(cachep); + enable_cpucache(cachep); up(&cache_chain_sem); } @@ -1184,7 +1195,7 @@ static int __init cpucache_init(void) * pages to gfp. 
*/ for_each_online_cpu(cpu) - start_cpu_timer(cpu); + start_cpu_timer(cpu); return 0; } @@ -1226,7 +1237,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid) */ static void kmem_freepages(kmem_cache_t *cachep, void *addr) { - unsigned long i = (1<gfporder); + unsigned long i = (1 << cachep->gfporder); struct page *page = virt_to_page(addr); const unsigned long nr_freed = i; @@ -1239,13 +1250,13 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; free_pages((unsigned long)addr, cachep->gfporder); - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - atomic_sub(1<gfporder, &slab_reclaim_pages); + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages); } static void kmem_rcu_free(struct rcu_head *head) { - struct slab_rcu *slab_rcu = (struct slab_rcu *) head; + struct slab_rcu *slab_rcu = (struct slab_rcu *)head; kmem_cache_t *cachep = slab_rcu->cachep; kmem_freepages(cachep, slab_rcu->addr); @@ -1257,19 +1268,19 @@ static void kmem_rcu_free(struct rcu_head *head) #ifdef CONFIG_DEBUG_PAGEALLOC static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, - unsigned long caller) + unsigned long caller) { int size = obj_reallen(cachep); - addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)]; + addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)]; - if (size < 5*sizeof(unsigned long)) + if (size < 5 * sizeof(unsigned long)) return; - *addr++=0x12345678; - *addr++=caller; - *addr++=smp_processor_id(); - size -= 3*sizeof(unsigned long); + *addr++ = 0x12345678; + *addr++ = caller; + *addr++ = smp_processor_id(); + size -= 3 * sizeof(unsigned long); { unsigned long *sptr = &caller; unsigned long svalue; @@ -1277,7 +1288,7 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, while (!kstack_end(sptr)) { svalue = *sptr++; if (kernel_text_address(svalue)) { - *addr++=svalue; + *addr++ = svalue; size -= sizeof(unsigned long); if (size <= sizeof(unsigned long)) break; @@ -1285,25 +1296,25 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, } } - *addr++=0x87654321; + *addr++ = 0x87654321; } #endif static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) { int size = obj_reallen(cachep); - addr = &((char*)addr)[obj_dbghead(cachep)]; + addr = &((char *)addr)[obj_dbghead(cachep)]; memset(addr, val, size); - *(unsigned char *)(addr+size-1) = POISON_END; + *(unsigned char *)(addr + size - 1) = POISON_END; } static void dump_line(char *data, int offset, int limit) { int i; printk(KERN_ERR "%03x:", offset); - for (i=0;iflags & SLAB_RED_ZONE) { printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", - *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } if (cachep->flags & SLAB_STORE_USER) { printk(KERN_ERR "Last user: [<%p>]", - *dbg_userword(cachep, objp)); + *dbg_userword(cachep, objp)); print_symbol("(%s)", - (unsigned long)*dbg_userword(cachep, objp)); + (unsigned long)*dbg_userword(cachep, objp)); printk("\n"); } - realobj = (char*)objp+obj_dbghead(cachep); + realobj = (char *)objp + obj_dbghead(cachep); size = obj_reallen(cachep); - for (i=0; i size) - limit = size-i; + if (i + limit > size) + limit = size - i; dump_line(realobj, i, limit); } } @@ -1346,27 +1357,28 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) int size, i; int lines = 0; - realobj = (char*)objp+obj_dbghead(cachep); + realobj 
= (char *)objp + obj_dbghead(cachep); size = obj_reallen(cachep); - for (i=0;i size) - limit = size-i; + if (i + limit > size) + limit = size - i; dump_line(realobj, i, limit); i += 16; lines++; @@ -1382,19 +1394,19 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) struct slab *slabp = page_get_slab(virt_to_page(objp)); int objnr; - objnr = (objp-slabp->s_mem)/cachep->objsize; + objnr = (objp - slabp->s_mem) / cachep->objsize; if (objnr) { - objp = slabp->s_mem+(objnr-1)*cachep->objsize; - realobj = (char*)objp+obj_dbghead(cachep); + objp = slabp->s_mem + (objnr - 1) * cachep->objsize; + realobj = (char *)objp + obj_dbghead(cachep); printk(KERN_ERR "Prev obj: start=%p, len=%d\n", - realobj, size); + realobj, size); print_objinfo(cachep, objp, 2); } - if (objnr+1 < cachep->num) { - objp = slabp->s_mem+(objnr+1)*cachep->objsize; - realobj = (char*)objp+obj_dbghead(cachep); + if (objnr + 1 < cachep->num) { + objp = slabp->s_mem + (objnr + 1) * cachep->objsize; + realobj = (char *)objp + obj_dbghead(cachep); printk(KERN_ERR "Next obj: start=%p, len=%d\n", - realobj, size); + realobj, size); print_objinfo(cachep, objp, 2); } } @@ -1405,7 +1417,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) * Before calling the slab must have been unlinked from the cache. * The cache-lock is not held/needed. */ -static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) +static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) { void *addr = slabp->s_mem - slabp->colouroff; @@ -1416,8 +1428,11 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); + if ((cachep->objsize % PAGE_SIZE) == 0 + && OFF_SLAB(cachep)) + kernel_map_pages(virt_to_page(objp), + cachep->objsize / PAGE_SIZE, + 1); else check_poison_obj(cachep, objp); #else @@ -1427,20 +1442,20 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) slab_error(cachep, "start of a freed object " - "was overwritten"); + "was overwritten"); if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) slab_error(cachep, "end of a freed object " - "was overwritten"); + "was overwritten"); } if (cachep->dtor && !(cachep->flags & SLAB_POISON)) - (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0); + (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0); } #else if (cachep->dtor) { int i; for (i = 0; i < cachep->num; i++) { - void* objp = slabp->s_mem+cachep->objsize*i; - (cachep->dtor)(objp, cachep, 0); + void *objp = slabp->s_mem + cachep->objsize * i; + (cachep->dtor) (objp, cachep, 0); } } #endif @@ -1448,7 +1463,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { struct slab_rcu *slab_rcu; - slab_rcu = (struct slab_rcu *) slabp; + slab_rcu = (struct slab_rcu *)slabp; slab_rcu->cachep = cachep; slab_rcu->addr = addr; call_rcu(&slab_rcu->head, kmem_rcu_free); @@ -1466,10 +1481,10 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index) int node; for_each_online_node(node) { - cachep->nodelists[node] = &initkmem_list3[index+node]; + cachep->nodelists[node] = &initkmem_list3[index + node]; cachep->nodelists[node]->next_reap = jiffies + - REAPTIMEOUT_LIST3 + - ((unsigned long)cachep)%REAPTIMEOUT_LIST3; + REAPTIMEOUT_LIST3 + + 
((unsigned long)cachep) % REAPTIMEOUT_LIST3; } } @@ -1486,7 +1501,7 @@ static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size, { size_t left_over = 0; - for ( ; ; cachep->gfporder++) { + for (;; cachep->gfporder++) { unsigned int num; size_t remainder; @@ -1566,14 +1581,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, * Sanity checks... these are all serious usage bugs. */ if ((!name) || - in_interrupt() || - (size < BYTES_PER_WORD) || - (size > (1< (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { + printk(KERN_ERR "%s: Early error in slab %s\n", + __FUNCTION__, name); + BUG(); + } down(&cache_chain_sem); @@ -1593,11 +1607,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, set_fs(old_fs); if (res) { printk("SLAB: cache with size %d has lost its name\n", - pc->objsize); + pc->objsize); continue; } - if (!strcmp(pc->name,name)) { + if (!strcmp(pc->name, name)) { printk("kmem_cache_create: duplicate cache %s\n", name); dump_stack(); goto oops; @@ -1609,10 +1623,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { /* No constructor, but inital state check requested */ printk(KERN_ERR "%s: No con, but init state check " - "requested - %s\n", __FUNCTION__, name); + "requested - %s\n", __FUNCTION__, name); flags &= ~SLAB_DEBUG_INITIAL; } - #if FORCED_DEBUG /* * Enable redzoning and last user accounting, except for caches with @@ -1620,8 +1633,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, * above the next power of two: caches with object sizes just above a * power of two have a significant amount of internal fragmentation. */ - if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) - flags |= SLAB_RED_ZONE|SLAB_STORE_USER; + if ((size < 4096 + || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD))) + flags |= SLAB_RED_ZONE | SLAB_STORE_USER; if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; #endif @@ -1642,9 +1656,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, * unaligned accesses for some archs when redzoning is used, and makes * sure any on-slab bufctl's are also correctly aligned. */ - if (size & (BYTES_PER_WORD-1)) { - size += (BYTES_PER_WORD-1); - size &= ~(BYTES_PER_WORD-1); + if (size & (BYTES_PER_WORD - 1)) { + size += (BYTES_PER_WORD - 1); + size &= ~(BYTES_PER_WORD - 1); } /* calculate out the final buffer alignment: */ @@ -1655,7 +1669,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * objects into one cacheline. */ ralign = cache_line_size(); - while (size <= ralign/2) + while (size <= ralign / 2) ralign /= 2; } else { ralign = BYTES_PER_WORD; @@ -1664,13 +1678,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (ralign < ARCH_SLAB_MINALIGN) { ralign = ARCH_SLAB_MINALIGN; if (ralign > BYTES_PER_WORD) - flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); + flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); } /* 3) caller mandated alignment: disables debug if necessary */ if (ralign < align) { ralign = align; if (ralign > BYTES_PER_WORD) - flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); + flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); } /* 4) Store it. Note that the debug code below can reduce * the alignment to BYTES_PER_WORD. 
@@ -1692,7 +1706,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* add space for red zone words */ cachep->dbghead += BYTES_PER_WORD; - size += 2*BYTES_PER_WORD; + size += 2 * BYTES_PER_WORD; } if (flags & SLAB_STORE_USER) { /* user store requires word alignment and @@ -1703,7 +1717,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) - if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { + if (size >= malloc_sizes[INDEX_L3 + 1].cs_size + && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { cachep->dbghead += PAGE_SIZE - size; size = PAGE_SIZE; } @@ -1711,7 +1726,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, #endif /* Determine if the slab management is 'on' or 'off' slab. */ - if (size >= (PAGE_SIZE>>3)) + if (size >= (PAGE_SIZE >> 3)) /* * Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). @@ -1728,7 +1743,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ cachep->gfporder = 0; cache_estimate(cachep->gfporder, size, align, flags, - &left_over, &cachep->num); + &left_over, &cachep->num); } else left_over = calculate_slab_order(cachep, size, align, flags); @@ -1738,8 +1753,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep = NULL; goto oops; } - slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) - + sizeof(struct slab), align); + slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) + + sizeof(struct slab), align); /* * If the slab has been placed off-slab, and we have enough space then @@ -1752,14 +1767,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ - slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); + slab_size = + cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); } cachep->colour_off = cache_line_size(); /* Offset must be a multiple of the alignment. */ if (cachep->colour_off < align) cachep->colour_off = align; - cachep->colour = left_over/cachep->colour_off; + cachep->colour = left_over / cachep->colour_off; cachep->slab_size = slab_size; cachep->flags = flags; cachep->gfpflags = 0; @@ -1786,7 +1802,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * the creation of further caches will BUG(). 
*/ cachep->array[smp_processor_id()] = - &initarray_generic.cache; + &initarray_generic.cache; /* If the cache that's used by * kmalloc(sizeof(kmem_list3)) is the first cache, @@ -1800,8 +1816,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, g_cpucache_up = PARTIAL_AC; } else { cachep->array[smp_processor_id()] = - kmalloc(sizeof(struct arraycache_init), - GFP_KERNEL); + kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); if (g_cpucache_up == PARTIAL_AC) { set_up_list3s(cachep, SIZE_L3); @@ -1811,16 +1826,18 @@ kmem_cache_create (const char *name, size_t size, size_t align, for_each_online_node(node) { cachep->nodelists[node] = - kmalloc_node(sizeof(struct kmem_list3), - GFP_KERNEL, node); + kmalloc_node(sizeof + (struct kmem_list3), + GFP_KERNEL, node); BUG_ON(!cachep->nodelists[node]); - kmem_list3_init(cachep->nodelists[node]); + kmem_list3_init(cachep-> + nodelists[node]); } } } cachep->nodelists[numa_node_id()]->next_reap = - jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep)%REAPTIMEOUT_LIST3; + jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; BUG_ON(!ac_data(cachep)); ac_data(cachep)->avail = 0; @@ -1829,15 +1846,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, ac_data(cachep)->touched = 0; cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; - } + } /* cache setup completed, link it into the list */ list_add(&cachep->next, &cache_chain); unlock_cpu_hotplug(); -oops: + oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", - name); + name); up(&cache_chain_sem); return cachep; } @@ -1880,7 +1897,7 @@ static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) /* * Waits for all CPUs to execute func(). */ -static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) +static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) { check_irq_on(); preempt_disable(); @@ -1895,12 +1912,12 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) preempt_enable(); } -static void drain_array_locked(kmem_cache_t* cachep, - struct array_cache *ac, int force, int node); +static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, + int force, int node); static void do_drain(void *arg) { - kmem_cache_t *cachep = (kmem_cache_t*)arg; + kmem_cache_t *cachep = (kmem_cache_t *) arg; struct array_cache *ac; int node = numa_node_id(); @@ -1920,7 +1937,7 @@ static void drain_cpu_caches(kmem_cache_t *cachep) smp_call_function_all_cpus(do_drain, cachep); check_irq_on(); spin_lock_irq(&cachep->spinlock); - for_each_online_node(node) { + for_each_online_node(node) { l3 = cachep->nodelists[node]; if (l3) { spin_lock(&l3->list_lock); @@ -1958,8 +1975,7 @@ static int __node_shrink(kmem_cache_t *cachep, int node) slab_destroy(cachep, slabp); spin_lock_irq(&l3->list_lock); } - ret = !list_empty(&l3->slabs_full) || - !list_empty(&l3->slabs_partial); + ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial); return ret; } @@ -2015,7 +2031,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); * The caller must guarantee that noone will allocate memory from the cache * during the kmem_cache_destroy(). 
*/ -int kmem_cache_destroy(kmem_cache_t * cachep) +int kmem_cache_destroy(kmem_cache_t *cachep) { int i; struct kmem_list3 *l3; @@ -2037,7 +2053,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep) if (__cache_shrink(cachep)) { slab_error(cachep, "Can't free all objects"); down(&cache_chain_sem); - list_add(&cachep->next,&cache_chain); + list_add(&cachep->next, &cache_chain); up(&cache_chain_sem); unlock_cpu_hotplug(); return 1; @@ -2047,7 +2063,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep) synchronize_rcu(); for_each_online_cpu(i) - kfree(cachep->array[i]); + kfree(cachep->array[i]); /* NUMA: free the list3 structures */ for_each_online_node(i) { @@ -2066,39 +2082,39 @@ int kmem_cache_destroy(kmem_cache_t * cachep) EXPORT_SYMBOL(kmem_cache_destroy); /* Get the memory for a slab management obj. */ -static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp, - int colour_off, gfp_t local_flags) +static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp, + int colour_off, gfp_t local_flags) { struct slab *slabp; - + if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); if (!slabp) return NULL; } else { - slabp = objp+colour_off; + slabp = objp + colour_off; colour_off += cachep->slab_size; } slabp->inuse = 0; slabp->colouroff = colour_off; - slabp->s_mem = objp+colour_off; + slabp->s_mem = objp + colour_off; return slabp; } static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) { - return (kmem_bufctl_t *)(slabp+1); + return (kmem_bufctl_t *) (slabp + 1); } static void cache_init_objs(kmem_cache_t *cachep, - struct slab *slabp, unsigned long ctor_flags) + struct slab *slabp, unsigned long ctor_flags) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem+cachep->objsize*i; + void *objp = slabp->s_mem + cachep->objsize * i; #if DEBUG /* need to poison the objs? */ if (cachep->flags & SLAB_POISON) @@ -2116,25 +2132,28 @@ static void cache_init_objs(kmem_cache_t *cachep, * Otherwise, deadlock. They must also be threaded. 
*/ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) - cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); + cachep->ctor(objp + obj_dbghead(cachep), cachep, + ctor_flags); if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) slab_error(cachep, "constructor overwrote the" - " end of an object"); + " end of an object"); if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) slab_error(cachep, "constructor overwrote the" - " start of an object"); + " start of an object"); } - if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) - kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); + if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) + && cachep->flags & SLAB_POISON) + kernel_map_pages(virt_to_page(objp), + cachep->objsize / PAGE_SIZE, 0); #else if (cachep->ctor) cachep->ctor(objp, cachep, ctor_flags); #endif - slab_bufctl(slabp)[i] = i+1; + slab_bufctl(slabp)[i] = i + 1; } - slab_bufctl(slabp)[i-1] = BUFCTL_END; + slab_bufctl(slabp)[i - 1] = BUFCTL_END; slabp->free = 0; } @@ -2170,17 +2189,17 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) */ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) { - struct slab *slabp; - void *objp; - size_t offset; - gfp_t local_flags; - unsigned long ctor_flags; + struct slab *slabp; + void *objp; + size_t offset; + gfp_t local_flags; + unsigned long ctor_flags; struct kmem_list3 *l3; /* Be lazy and only check for valid flags here, - * keeping it out of the critical path in kmem_cache_alloc(). + * keeping it out of the critical path in kmem_cache_alloc(). */ - if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) + if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) BUG(); if (flags & SLAB_NO_GROW) return 0; @@ -2246,9 +2265,9 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) l3->free_objects += cachep->num; spin_unlock(&l3->list_lock); return 1; -opps1: + opps1: kmem_freepages(cachep, objp); -failed: + failed: if (local_flags & __GFP_WAIT) local_irq_disable(); return 0; @@ -2268,18 +2287,19 @@ static void kfree_debugcheck(const void *objp) if (!virt_addr_valid(objp)) { printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", - (unsigned long)objp); - BUG(); + (unsigned long)objp); + BUG(); } page = virt_to_page(objp); if (!PageSlab(page)) { - printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); + printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", + (unsigned long)objp); BUG(); } } static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, - void *caller) + void *caller) { struct page *page; unsigned int objnr; @@ -2290,20 +2310,26 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, page = virt_to_page(objp); if (page_get_cache(page) != cachep) { - printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", - page_get_cache(page),cachep); + printk(KERN_ERR + "mismatch in kmem_cache_free: expected cache %p, got %p\n", + page_get_cache(page), cachep); printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); - printk(KERN_ERR "%p is %s.\n", page_get_cache(page), page_get_cache(page)->name); + printk(KERN_ERR "%p is %s.\n", page_get_cache(page), + page_get_cache(page)->name); WARN_ON(1); } slabp = page_get_slab(page); if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { - slab_error(cachep, "double free, or memory outside" - " object was 
overwritten"); - printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", - objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); + if (*dbg_redzone1(cachep, objp) != RED_ACTIVE + || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { + slab_error(cachep, + "double free, or memory outside" + " object was overwritten"); + printk(KERN_ERR + "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } *dbg_redzone1(cachep, objp) = RED_INACTIVE; *dbg_redzone2(cachep, objp) = RED_INACTIVE; @@ -2311,30 +2337,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = caller; - objnr = (objp-slabp->s_mem)/cachep->objsize; + objnr = (objp - slabp->s_mem) / cachep->objsize; BUG_ON(objnr >= cachep->num); - BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); + BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize); if (cachep->flags & SLAB_DEBUG_INITIAL) { /* Need to call the slab's constructor so the * caller can perform a verify of its state (debugging). * Called without the cache-lock held. */ - cachep->ctor(objp+obj_dbghead(cachep), - cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); + cachep->ctor(objp + obj_dbghead(cachep), + cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); } if (cachep->flags & SLAB_POISON && cachep->dtor) { /* we want to cache poison the object, * call the destruction callback */ - cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); + cachep->dtor(objp + obj_dbghead(cachep), cachep, 0); } if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { store_stackinfo(cachep, objp, (unsigned long)caller); - kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); + kernel_map_pages(virt_to_page(objp), + cachep->objsize / PAGE_SIZE, 0); } else { poison_obj(cachep, objp, POISON_FREE); } @@ -2349,7 +2376,7 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) { kmem_bufctl_t i; int entries = 0; - + /* Check slab's freelist to see if this obj is there. */ for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { entries++; @@ -2357,13 +2384,16 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) goto bad; } if (entries != cachep->num - slabp->inuse) { -bad: - printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", - cachep->name, cachep->num, slabp, slabp->inuse); - for (i=0;inum*sizeof(kmem_bufctl_t);i++) { - if ((i%16)==0) + bad: + printk(KERN_ERR + "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). 
Hexdump:\n", + cachep->name, cachep->num, slabp, slabp->inuse); + for (i = 0; + i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t); + i++) { + if ((i % 16) == 0) printk("\n%03x:", i); - printk(" %02x", ((unsigned char*)slabp)[i]); + printk(" %02x", ((unsigned char *)slabp)[i]); } printk("\n"); BUG(); @@ -2383,7 +2413,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) check_irq_off(); ac = ac_data(cachep); -retry: + retry: batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { /* if there was little recent activity on this @@ -2405,8 +2435,8 @@ retry: shared_array->avail -= batchcount; ac->avail = batchcount; memcpy(ac->entry, - &(shared_array->entry[shared_array->avail]), - sizeof(void*)*batchcount); + &(shared_array->entry[shared_array->avail]), + sizeof(void *) * batchcount); shared_array->touched = 1; goto alloc_done; } @@ -2434,7 +2464,7 @@ retry: /* get obj pointer */ ac->entry[ac->avail++] = slabp->s_mem + - slabp->free*cachep->objsize; + slabp->free * cachep->objsize; slabp->inuse++; next = slab_bufctl(slabp)[slabp->free]; @@ -2442,7 +2472,7 @@ retry: slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; WARN_ON(numa_node_id() != slabp->nodeid); #endif - slabp->free = next; + slabp->free = next; } check_slabp(cachep, slabp); @@ -2454,9 +2484,9 @@ retry: list_add(&slabp->list, &l3->slabs_partial); } -must_grow: + must_grow: l3->free_objects -= ac->avail; -alloc_done: + alloc_done: spin_unlock(&l3->list_lock); if (unlikely(!ac->avail)) { @@ -2468,7 +2498,7 @@ alloc_done: if (!x && ac->avail == 0) // no objects in sight? abort return NULL; - if (!ac->avail) // objects refilled by interrupt? + if (!ac->avail) // objects refilled by interrupt? goto retry; } ac->touched = 1; @@ -2485,16 +2515,16 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags) } #if DEBUG -static void * -cache_alloc_debugcheck_after(kmem_cache_t *cachep, - gfp_t flags, void *objp, void *caller) +static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, + void *objp, void *caller) { - if (!objp) + if (!objp) return objp; - if (cachep->flags & SLAB_POISON) { + if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); + kernel_map_pages(virt_to_page(objp), + cachep->objsize / PAGE_SIZE, 1); else check_poison_obj(cachep, objp); #else @@ -2506,24 +2536,28 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep, *dbg_userword(cachep, objp) = caller; if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { - slab_error(cachep, "double free, or memory outside" - " object was overwritten"); - printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", - objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE + || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { + slab_error(cachep, + "double free, or memory outside" + " object was overwritten"); + printk(KERN_ERR + "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } objp += obj_dbghead(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) { - unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; + unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; if (!(flags & __GFP_WAIT)) ctor_flags 
|= SLAB_CTOR_ATOMIC; cachep->ctor(objp, cachep, ctor_flags); - } + } return objp; } #else @@ -2532,7 +2566,7 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep, static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) { - void* objp; + void *objp; struct array_cache *ac; check_irq_off(); @@ -2551,7 +2585,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) { unsigned long save_flags; - void* objp; + void *objp; cache_alloc_debugcheck_before(cachep, flags); @@ -2559,7 +2593,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) objp = ____cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, - __builtin_return_address(0)); + __builtin_return_address(0)); prefetchw(objp); return objp; } @@ -2571,74 +2605,75 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) { struct list_head *entry; - struct slab *slabp; - struct kmem_list3 *l3; - void *obj; - kmem_bufctl_t next; - int x; - - l3 = cachep->nodelists[nodeid]; - BUG_ON(!l3); - -retry: - spin_lock(&l3->list_lock); - entry = l3->slabs_partial.next; - if (entry == &l3->slabs_partial) { - l3->free_touched = 1; - entry = l3->slabs_free.next; - if (entry == &l3->slabs_free) - goto must_grow; - } - - slabp = list_entry(entry, struct slab, list); - check_spinlock_acquired_node(cachep, nodeid); - check_slabp(cachep, slabp); - - STATS_INC_NODEALLOCS(cachep); - STATS_INC_ACTIVE(cachep); - STATS_SET_HIGH(cachep); - - BUG_ON(slabp->inuse == cachep->num); - - /* get obj pointer */ - obj = slabp->s_mem + slabp->free*cachep->objsize; - slabp->inuse++; - next = slab_bufctl(slabp)[slabp->free]; + struct slab *slabp; + struct kmem_list3 *l3; + void *obj; + kmem_bufctl_t next; + int x; + + l3 = cachep->nodelists[nodeid]; + BUG_ON(!l3); + + retry: + spin_lock(&l3->list_lock); + entry = l3->slabs_partial.next; + if (entry == &l3->slabs_partial) { + l3->free_touched = 1; + entry = l3->slabs_free.next; + if (entry == &l3->slabs_free) + goto must_grow; + } + + slabp = list_entry(entry, struct slab, list); + check_spinlock_acquired_node(cachep, nodeid); + check_slabp(cachep, slabp); + + STATS_INC_NODEALLOCS(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + BUG_ON(slabp->inuse == cachep->num); + + /* get obj pointer */ + obj = slabp->s_mem + slabp->free * cachep->objsize; + slabp->inuse++; + next = slab_bufctl(slabp)[slabp->free]; #if DEBUG - slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; + slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; #endif - slabp->free = next; - check_slabp(cachep, slabp); - l3->free_objects--; - /* move slabp to correct slabp list: */ - list_del(&slabp->list); - - if (slabp->free == BUFCTL_END) { - list_add(&slabp->list, &l3->slabs_full); - } else { - list_add(&slabp->list, &l3->slabs_partial); - } + slabp->free = next; + check_slabp(cachep, slabp); + l3->free_objects--; + /* move slabp to correct slabp list: */ + list_del(&slabp->list); + + if (slabp->free == BUFCTL_END) { + list_add(&slabp->list, &l3->slabs_full); + } else { + list_add(&slabp->list, &l3->slabs_partial); + } - spin_unlock(&l3->list_lock); - goto done; + spin_unlock(&l3->list_lock); + goto done; -must_grow: - spin_unlock(&l3->list_lock); - x = cache_grow(cachep, flags, nodeid); + must_grow: + spin_unlock(&l3->list_lock); + x = cache_grow(cachep, flags, nodeid); - if (!x) - return 
NULL; + if (!x) + return NULL; - goto retry; -done: - return obj; + goto retry; + done: + return obj; } #endif /* * Caller needs to acquire correct kmem_list's list_lock */ -static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node) +static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, + int node) { int i; struct kmem_list3 *l3; @@ -2661,7 +2696,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); + "'%s', objp %p\n", cachep->name, objp); BUG(); } #endif @@ -2705,20 +2740,19 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) spin_lock(&l3->list_lock); if (l3->shared) { struct array_cache *shared_array = l3->shared; - int max = shared_array->limit-shared_array->avail; + int max = shared_array->limit - shared_array->avail; if (max) { if (batchcount > max) batchcount = max; memcpy(&(shared_array->entry[shared_array->avail]), - ac->entry, - sizeof(void*)*batchcount); + ac->entry, sizeof(void *) * batchcount); shared_array->avail += batchcount; goto free_done; } } free_block(cachep, ac->entry, batchcount, node); -free_done: + free_done: #if STATS { int i = 0; @@ -2740,10 +2774,9 @@ free_done: spin_unlock(&l3->list_lock); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), - sizeof(void*)*ac->avail); + sizeof(void *) * ac->avail); } - /* * __cache_free * Release an obj back to its cache. If the obj has a constructed @@ -2768,7 +2801,8 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp) if (unlikely(slabp->nodeid != numa_node_id())) { struct array_cache *alien = NULL; int nodeid = slabp->nodeid; - struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()]; + struct kmem_list3 *l3 = + cachep->nodelists[numa_node_id()]; STATS_INC_NODEFREES(cachep); if (l3->alien && l3->alien[nodeid]) { @@ -2776,15 +2810,15 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp) spin_lock(&alien->lock); if (unlikely(alien->avail == alien->limit)) __drain_alien_cache(cachep, - alien, nodeid); + alien, nodeid); alien->entry[alien->avail++] = objp; spin_unlock(&alien->lock); } else { spin_lock(&(cachep->nodelists[nodeid])-> - list_lock); + list_lock); free_block(cachep, &objp, 1, nodeid); spin_unlock(&(cachep->nodelists[nodeid])-> - list_lock); + list_lock); } return; } @@ -2831,9 +2865,9 @@ EXPORT_SYMBOL(kmem_cache_alloc); */ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) { - unsigned long addr = (unsigned long) ptr; + unsigned long addr = (unsigned long)ptr; unsigned long min_addr = PAGE_OFFSET; - unsigned long align_mask = BYTES_PER_WORD-1; + unsigned long align_mask = BYTES_PER_WORD - 1; unsigned long size = cachep->objsize; struct page *page; @@ -2853,7 +2887,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) if (unlikely(page_get_cache(page) != cachep)) goto out; return 1; -out: + out: return 0; } @@ -2880,8 +2914,10 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) if (unlikely(!cachep->nodelists[nodeid])) { /* Fall back to __cache_alloc if we run into trouble */ - printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name); - return __cache_alloc(cachep,flags); + printk(KERN_WARNING + "slab: not allocating in inactive node %d for cache %s\n", + nodeid, cachep->name); + return __cache_alloc(cachep, flags); } 
cache_alloc_debugcheck_before(cachep, flags); @@ -2891,7 +2927,9 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) else ptr = __cache_alloc_node(cachep, flags, nodeid); local_irq_restore(save_flags); - ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0)); + ptr = + cache_alloc_debugcheck_after(cachep, flags, ptr, + __builtin_return_address(0)); return ptr; } @@ -2957,7 +2995,7 @@ EXPORT_SYMBOL(__kmalloc); void *__alloc_percpu(size_t size) { int i; - struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); + struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL); if (!pdata) return NULL; @@ -2981,9 +3019,9 @@ void *__alloc_percpu(size_t size) } /* Catch derefs w/o wrappers */ - return (void *) (~(unsigned long) pdata); + return (void *)(~(unsigned long)pdata); -unwind_oom: + unwind_oom: while (--i >= 0) { if (!cpu_possible(i)) continue; @@ -3046,7 +3084,7 @@ void kfree(const void *objp) local_irq_save(flags); kfree_debugcheck(objp); c = page_get_cache(virt_to_page(objp)); - __cache_free(c, (void*)objp); + __cache_free(c, (void *)objp); local_irq_restore(flags); } EXPORT_SYMBOL(kfree); @@ -3059,17 +3097,16 @@ EXPORT_SYMBOL(kfree); * Don't free memory not originally allocated by alloc_percpu() * The complemented objp is to check for that. */ -void -free_percpu(const void *objp) +void free_percpu(const void *objp) { int i; - struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); + struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp); /* * We allocate for all cpus so we cannot use for online cpu here. */ for_each_cpu(i) - kfree(p->ptrs[i]); + kfree(p->ptrs[i]); kfree(p); } EXPORT_SYMBOL(free_percpu); @@ -3103,44 +3140,44 @@ static int alloc_kmemlist(kmem_cache_t *cachep) if (!(new_alien = alloc_alien_cache(node, cachep->limit))) goto fail; #endif - if (!(new = alloc_arraycache(node, (cachep->shared* - cachep->batchcount), 0xbaadf00d))) + if (!(new = alloc_arraycache(node, (cachep->shared * + cachep->batchcount), + 0xbaadf00d))) goto fail; if ((l3 = cachep->nodelists[node])) { spin_lock_irq(&l3->list_lock); if ((nc = cachep->nodelists[node]->shared)) - free_block(cachep, nc->entry, - nc->avail, node); + free_block(cachep, nc->entry, nc->avail, node); l3->shared = new; if (!cachep->nodelists[node]->alien) { l3->alien = new_alien; new_alien = NULL; } - l3->free_limit = (1 + nr_cpus_node(node))* - cachep->batchcount + cachep->num; + l3->free_limit = (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; spin_unlock_irq(&l3->list_lock); kfree(nc); free_alien_cache(new_alien); continue; } if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), - GFP_KERNEL, node))) + GFP_KERNEL, node))) goto fail; kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep)%REAPTIMEOUT_LIST3; + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; l3->shared = new; l3->alien = new_alien; - l3->free_limit = (1 + nr_cpus_node(node))* - cachep->batchcount + cachep->num; + l3->free_limit = (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; cachep->nodelists[node] = l3; } return err; -fail: + fail: err = -ENOMEM; return err; } @@ -3162,18 +3199,19 @@ static void do_ccupdate_local(void *info) new->new[smp_processor_id()] = old; } - static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, - int shared) + int shared) { struct ccupdate_struct new; int i, err; - memset(&new.new,0,sizeof(new.new)); + memset(&new.new, 0, sizeof(new.new)); for_each_online_cpu(i) { 
- new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); + new.new[i] = + alloc_arraycache(cpu_to_node(i), limit, batchcount); if (!new.new[i]) { - for (i--; i >= 0; i--) kfree(new.new[i]); + for (i--; i >= 0; i--) + kfree(new.new[i]); return -ENOMEM; } } @@ -3201,13 +3239,12 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, err = alloc_kmemlist(cachep); if (err) { printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", - cachep->name, -err); + cachep->name, -err); BUG(); } return 0; } - static void enable_cpucache(kmem_cache_t *cachep) { int err; @@ -3254,14 +3291,14 @@ static void enable_cpucache(kmem_cache_t *cachep) if (limit > 32) limit = 32; #endif - err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); + err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", - cachep->name, -err); + cachep->name, -err); } -static void drain_array_locked(kmem_cache_t *cachep, - struct array_cache *ac, int force, int node) +static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, + int force, int node) { int tofree; @@ -3269,14 +3306,14 @@ static void drain_array_locked(kmem_cache_t *cachep, if (ac->touched && !force) { ac->touched = 0; } else if (ac->avail) { - tofree = force ? ac->avail : (ac->limit+4)/5; + tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) { - tofree = (ac->avail+1)/2; + tofree = (ac->avail + 1) / 2; } free_block(cachep, ac->entry, tofree, node); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), - sizeof(void*)*ac->avail); + sizeof(void *) * ac->avail); } } @@ -3299,13 +3336,14 @@ static void cache_reap(void *unused) if (down_trylock(&cache_chain_sem)) { /* Give up. Setup the next iteration. */ - schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); + schedule_delayed_work(&__get_cpu_var(reap_work), + REAPTIMEOUT_CPUC); return; } list_for_each(walk, &cache_chain) { kmem_cache_t *searchp; - struct list_head* p; + struct list_head *p; int tofree; struct slab *slabp; @@ -3322,7 +3360,7 @@ static void cache_reap(void *unused) spin_lock_irq(&l3->list_lock); drain_array_locked(searchp, ac_data(searchp), 0, - numa_node_id()); + numa_node_id()); if (time_after(l3->next_reap, jiffies)) goto next_unlock; @@ -3331,14 +3369,16 @@ static void cache_reap(void *unused) if (l3->shared) drain_array_locked(searchp, l3->shared, 0, - numa_node_id()); + numa_node_id()); if (l3->free_touched) { l3->free_touched = 0; goto next_unlock; } - tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num); + tofree = + (l3->free_limit + 5 * searchp->num - + 1) / (5 * searchp->num); do { p = l3->slabs_free.next; if (p == &(l3->slabs_free)) @@ -3358,10 +3398,10 @@ static void cache_reap(void *unused) spin_unlock_irq(&l3->list_lock); slab_destroy(searchp, slabp); spin_lock_irq(&l3->list_lock); - } while(--tofree > 0); -next_unlock: + } while (--tofree > 0); + next_unlock: spin_unlock_irq(&l3->list_lock); -next: + next: cond_resched(); } check_irq_on(); @@ -3418,7 +3458,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) kmem_cache_t *cachep = p; ++*pos; return cachep->next.next == &cache_chain ? 
NULL - : list_entry(cachep->next.next, kmem_cache_t, next); + : list_entry(cachep->next.next, kmem_cache_t, next); } static void s_stop(struct seq_file *m, void *p) @@ -3430,11 +3470,11 @@ static int s_show(struct seq_file *m, void *p) { kmem_cache_t *cachep = p; struct list_head *q; - struct slab *slabp; - unsigned long active_objs; - unsigned long num_objs; - unsigned long active_slabs = 0; - unsigned long num_slabs, free_objects = 0, shared_avail = 0; + struct slab *slabp; + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs = 0; + unsigned long num_slabs, free_objects = 0, shared_avail = 0; const char *name; char *error = NULL; int node; @@ -3451,14 +3491,14 @@ static int s_show(struct seq_file *m, void *p) spin_lock(&l3->list_lock); - list_for_each(q,&l3->slabs_full) { + list_for_each(q, &l3->slabs_full) { slabp = list_entry(q, struct slab, list); if (slabp->inuse != cachep->num && !error) error = "slabs_full accounting error"; active_objs += cachep->num; active_slabs++; } - list_for_each(q,&l3->slabs_partial) { + list_for_each(q, &l3->slabs_partial) { slabp = list_entry(q, struct slab, list); if (slabp->inuse == cachep->num && !error) error = "slabs_partial inuse accounting error"; @@ -3467,7 +3507,7 @@ static int s_show(struct seq_file *m, void *p) active_objs += slabp->inuse; active_slabs++; } - list_for_each(q,&l3->slabs_free) { + list_for_each(q, &l3->slabs_free) { slabp = list_entry(q, struct slab, list); if (slabp->inuse && !error) error = "slabs_free/inuse accounting error"; @@ -3478,25 +3518,24 @@ static int s_show(struct seq_file *m, void *p) spin_unlock(&l3->list_lock); } - num_slabs+=active_slabs; - num_objs = num_slabs*cachep->num; + num_slabs += active_slabs; + num_objs = num_slabs * cachep->num; if (num_objs - active_objs != free_objects && !error) error = "free_objects accounting error"; - name = cachep->name; + name = cachep->name; if (error) printk(KERN_ERR "slab: cache %s error: %s\n", name, error); seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", - name, active_objs, num_objs, cachep->objsize, - cachep->num, (1<gfporder)); + name, active_objs, num_objs, cachep->objsize, + cachep->num, (1 << cachep->gfporder)); seq_printf(m, " : tunables %4u %4u %4u", - cachep->limit, cachep->batchcount, - cachep->shared); + cachep->limit, cachep->batchcount, cachep->shared); seq_printf(m, " : slabdata %6lu %6lu %6lu", - active_slabs, num_slabs, shared_avail); + active_slabs, num_slabs, shared_avail); #if STATS - { /* list3 stats */ + { /* list3 stats */ unsigned long high = cachep->high_mark; unsigned long allocs = cachep->num_allocations; unsigned long grown = cachep->grown; @@ -3507,9 +3546,7 @@ static int s_show(struct seq_file *m, void *p) unsigned long node_frees = cachep->node_frees; seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu", - allocs, high, grown, reaped, errors, - max_freeable, node_allocs, node_frees); + %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); } /* cpu stats */ { @@ -3519,7 +3556,7 @@ static int s_show(struct seq_file *m, void *p) unsigned long freemiss = atomic_read(&cachep->freemiss); seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", - allochit, allocmiss, freehit, freemiss); + allochit, allocmiss, freehit, freemiss); } #endif seq_putc(m, '\n'); @@ -3542,10 +3579,10 @@ static int s_show(struct seq_file *m, void *p) */ struct seq_operations slabinfo_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, + .start = s_start, + .next = 
s_next, + .stop = s_stop, + .show = s_show, }; #define MAX_SLABINFO_WRITE 128 @@ -3556,18 +3593,18 @@ struct seq_operations slabinfo_op = { * @count: data length * @ppos: unused */ -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) +ssize_t slabinfo_write(struct file *file, const char __user * buffer, + size_t count, loff_t *ppos) { - char kbuf[MAX_SLABINFO_WRITE+1], *tmp; + char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; int limit, batchcount, shared, res; struct list_head *p; - + if (count > MAX_SLABINFO_WRITE) return -EINVAL; if (copy_from_user(&kbuf, buffer, count)) return -EFAULT; - kbuf[MAX_SLABINFO_WRITE] = '\0'; + kbuf[MAX_SLABINFO_WRITE] = '\0'; tmp = strchr(kbuf, ' '); if (!tmp) @@ -3580,18 +3617,17 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, /* Find the cache in the chain of caches. */ down(&cache_chain_sem); res = -EINVAL; - list_for_each(p,&cache_chain) { + list_for_each(p, &cache_chain) { kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); if (!strcmp(cachep->name, kbuf)) { if (limit < 1 || batchcount < 1 || - batchcount > limit || - shared < 0) { + batchcount > limit || shared < 0) { res = 0; } else { res = do_tune_cpucache(cachep, limit, - batchcount, shared); + batchcount, shared); } break; } -- cgit v1.2.2 From bec6b0c89b234090681a4516e20ac5debe3e7c59 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:38 -0800 Subject: [PATCH] slab: remove nested #ifdef CONFIG_NUMA For some reason there is an #ifdef CONFIG_NUMA within another #ifdef CONFIG_NUMA in the page allocator. Remove innermost #ifdef CONFIG_NUMA Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b98f428b0..5eeeadd9f6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1883,7 +1883,6 @@ bad: static inline void free_zone_pagesets(int cpu) { -#ifdef CONFIG_NUMA struct zone *zone; for_each_zone(zone) { @@ -1892,7 +1891,6 @@ static inline void free_zone_pagesets(int cpu) zone_pcp(zone, cpu) = NULL; kfree(pset); } -#endif } static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, -- cgit v1.2.2 From 9d0243bca345d5ce25d3f4b74b7facb3a6df1232 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:00:39 -0800 Subject: [PATCH] drop-pagecache Add /proc/sys/vm/drop_caches. When written to, this will cause the kernel to discard as much pagecache and/or reclaimable slab objects as it can. This operation requires root permissions. It won't drop dirty data, so the user should run `sync' first. Caveats: a) Holds inode_lock for exorbitant amounts of time. b) Needs to be taught about NUMA nodes: propagate these all the way through so the discarding can be controlled on a per-node basis. This is a debugging feature: useful for getting consistent results between filesystem benchmarks. We could possibly put it under a config option, but it's less than 300 bytes.
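As a usage illustration only (not part of this patch), a minimal userspace sketch that exercises the new sysctl might look as follows; it assumes the /proc/sys/vm/drop_caches semantics described above (1 = pagecache, 2 = dentries and inodes, 3 = both), must run as root, and calls sync() first because dirty data is never dropped:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd;

	/* Flush dirty data first; drop_caches only frees clean objects. */
	sync();

	fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
	if (fd < 0) {
		perror("open /proc/sys/vm/drop_caches");
		return 1;
	}

	/* "1" drops pagecache, "2" drops dentries and inodes, "3" drops both. */
	if (write(fd, "3", 1) != 1)
		perror("write");

	close(fd);
	return 0;
}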
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 17 ++++++++++ Documentation/sysctl/vm.txt | 3 +- fs/Makefile | 2 +- fs/drop_caches.c | 68 ++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 7 ++++ include/linux/sysctl.h | 1 + kernel/sysctl.c | 10 ++++++ mm/truncate.c | 1 - mm/vmscan.c | 3 +- 9 files changed, 107 insertions(+), 5 deletions(-) create mode 100644 fs/drop_caches.c diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index d4773565ea..a4dcf42c2f 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1302,6 +1302,23 @@ VM has token based thrashing control mechanism and uses the token to prevent unnecessary page faults in thrashing situation. The unit of the value is second. The value would be useful to tune thrashing behavior. +drop_caches +----------- + +Writing to this will cause the kernel to drop clean caches, dentries and +inodes from memory, causing that memory to become free. + +To free pagecache: + echo 1 > /proc/sys/vm/drop_caches +To free dentries and inodes: + echo 2 > /proc/sys/vm/drop_caches +To free pagecache, dentries and inodes: + echo 3 > /proc/sys/vm/drop_caches + +As this is a non-destructive operation and dirty objects are not freeable, the +user should run `sync' first. + + 2.5 /proc/sys/dev - Device specific parameters ---------------------------------------------- diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 2f1aae32a5..89ba1a42a1 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -26,12 +26,13 @@ Currently, these files are in /proc/sys/vm: - min_free_kbytes - laptop_mode - block_dump +- drop-caches ============================================================== dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode, -block_dump, swap_token_timeout: +block_dump, swap_token_timeout, drop-caches: See Documentation/filesystems/proc.txt diff --git a/fs/Makefile b/fs/Makefile index 73676111eb..35e9aec608 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \ ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ - ioprio.o pnode.o + ioprio.o pnode.o drop_caches.o obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_EPOLL) += eventpoll.o diff --git a/fs/drop_caches.c b/fs/drop_caches.c new file mode 100644 index 0000000000..4e4762389b --- /dev/null +++ b/fs/drop_caches.c @@ -0,0 +1,68 @@ +/* + * Implement the manual drop-all-pagecache function + */ + +#include +#include +#include +#include +#include +#include + +/* A global variable is a bit ugly, but it keeps the code simple */ +int sysctl_drop_caches; + +static void drop_pagecache_sb(struct super_block *sb) +{ + struct inode *inode; + + spin_lock(&inode_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (inode->i_state & (I_FREEING|I_WILL_FREE)) + continue; + invalidate_inode_pages(inode->i_mapping); + } + spin_unlock(&inode_lock); +} + +void drop_pagecache(void) +{ + struct super_block *sb; + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root) + drop_pagecache_sb(sb); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if 
(__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); +} + +void drop_slab(void) +{ + int nr_objects; + + do { + nr_objects = shrink_slab(1000, GFP_KERNEL, 1000); + } while (nr_objects > 10); +} + +int drop_caches_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (write) { + if (sysctl_drop_caches & 1) + drop_pagecache(); + if (sysctl_drop_caches & 2) + drop_slab(); + } + return 0; +} diff --git a/include/linux/mm.h b/include/linux/mm.h index bc01fff3aa..83c651f251 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1036,5 +1036,12 @@ int in_gate_area_no_task(unsigned long addr); /* /proc//oom_adj set to -17 protects from the oom-killer */ #define OOM_DISABLE -17 +int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +int shrink_slab(unsigned long scanned, gfp_t gfp_mask, + unsigned long lru_pages); +void drop_pagecache(void); +void drop_slab(void); + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index a9b80fc7f0..4cd267fe87 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -180,6 +180,7 @@ enum VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ + VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a85047bb57..8dcf6fd5b0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -68,6 +68,7 @@ extern int min_free_kbytes; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; +extern int sysctl_drop_caches; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; @@ -774,6 +775,15 @@ static ctl_table vm_table[] = { .proc_handler = &lowmem_reserve_ratio_sysctl_handler, .strategy = &sysctl_intvec, }, + { + .ctl_name = VM_DROP_PAGECACHE, + .procname = "drop_caches", + .data = &sysctl_drop_caches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = drop_caches_sysctl_handler, + .strategy = &sysctl_intvec, + }, { .ctl_name = VM_MIN_FREE_KBYTES, .procname = "min_free_kbytes", diff --git a/mm/truncate.c b/mm/truncate.c index 7dee327459..b1a463d0fe 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -249,7 +249,6 @@ unlock: break; } pagevec_release(&pvec); - cond_resched(); } return ret; } diff --git a/mm/vmscan.c b/mm/vmscan.c index be8235fb19..428c5801d4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -180,8 +180,7 @@ EXPORT_SYMBOL(remove_shrinker); * * Returns the number of slab objects which we shrunk. */ -static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, - unsigned long lru_pages) +int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) { struct shrinker *shrinker; int ret = 0; -- cgit v1.2.2 From 8ad4b1fb8205340dba16b63467bb23efc27264d6 Mon Sep 17 00:00:00 2001 From: Rohit Seth Date: Sun, 8 Jan 2006 01:00:40 -0800 Subject: [PATCH] Make high and batch sizes of per_cpu_pagelists configurable As recently there has been lot of traffic on the right values for batch and high water marks for per_cpu_pagelists. This patch makes these two variables configurable through /proc interface. A new tunable /proc/sys/vm/percpu_pagelist_fraction is added. 
This entry controls the fraction of pages at most in each zone that are allocated for each per cpu page list. The min value for this is 8. It means that we don't allow more than 1/8th of pages in each zone to be allocated in any single per_cpu_pagelist. The batch value of each per cpu pagelist is also updated as a result. It is set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) Signed-off-by: Rohit Seth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 17 ++++++++++++++++ include/linux/mmzone.h | 2 ++ include/linux/sysctl.h | 1 + kernel/sysctl.c | 12 +++++++++++ mm/page_alloc.c | 49 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 81 insertions(+) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 89ba1a42a1..6910c0136f 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -103,3 +103,20 @@ This is used to force the Linux VM to keep a minimum number of kilobytes free. The VM uses this number to compute a pages_min value for each lowmem zone in the system. Each lowmem zone gets a number of reserved free pages based proportionally on its size. + +============================================================== + +percpu_pagelist_fraction + +This is the fraction of pages at most (high mark pcp->high) in each zone that +are allocated for each per cpu page list. The min value for this is 8. It +means that we don't allow more than 1/8th of pages in each zone to be +allocated in any single per_cpu_pagelist. This entry only changes the value +of hot per cpu pagelists. User can specify a number like 100 to allocate +1/100th of each zone to each per cpu page list. + +The batch value of each per cpu pagelist is also updated as a result. It is +set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) + +The initial value is zero. Kernel does not use this value at boot time to set +the high water marks for each per cpu page list. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c34f4a2c62..2a89c132ba 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -437,6 +437,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); #include /* Returns the number of the current Node. 
*/ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 4cd267fe87..7f472127b7 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -181,6 +181,7 @@ enum VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ + VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8dcf6fd5b0..03b0598f23 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -69,6 +69,7 @@ extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; +extern int percpu_pagelist_fraction; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; @@ -79,6 +80,7 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; +static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; @@ -794,6 +796,16 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = VM_PERCPU_PAGELIST_FRACTION, + .procname = "percpu_pagelist_fraction", + .data = &percpu_pagelist_fraction, + .maxlen = sizeof(percpu_pagelist_fraction), + .mode = 0644, + .proc_handler = &percpu_pagelist_fraction_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &min_percpu_pagelist_fract, + }, #ifdef CONFIG_MMU { .ctl_name = VM_MAX_MAP_COUNT, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5eeeadd9f6..2c46f697e8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,6 +53,7 @@ struct pglist_data *pgdat_list __read_mostly; unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; +int percpu_pagelist_fraction; static void fastcall free_hot_cold_page(struct page *page, int cold); @@ -1831,6 +1832,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) INIT_LIST_HEAD(&pcp->list); } +/* + * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * to the value high for the pageset p. + */ + +static void setup_pagelist_highmark(struct per_cpu_pageset *p, + unsigned long high) +{ + struct per_cpu_pages *pcp; + + pcp = &p->pcp[0]; /* hot list */ + pcp->high = high; + pcp->batch = max(1UL, high/4); + if ((high/4) > (PAGE_SHIFT * 8)) + pcp->batch = PAGE_SHIFT * 8; +} + + #ifdef CONFIG_NUMA /* * Boot pageset table. One per cpu which is going to be used for all @@ -1868,6 +1887,10 @@ static int __devinit process_zones(int cpu) goto bad; setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); + + if (percpu_pagelist_fraction) + setup_pagelist_highmark(zone_pcp(zone, cpu), + (zone->present_pages / percpu_pagelist_fraction)); } return 0; @@ -2567,6 +2590,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, return 0; } +/* + * percpu_pagelist_fraction - changes the pcp->high for each zone on each + * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist + * can have before it gets flushed back to buddy allocator. 
+ */ + +int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + unsigned int cpu; + int ret; + + ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (!write || (ret == -EINVAL)) + return ret; + for_each_zone(zone) { + for_each_online_cpu(cpu) { + unsigned long high; + high = zone->present_pages / percpu_pagelist_fraction; + setup_pagelist_highmark(zone_pcp(zone, cpu), high); + } + } + return 0; +} + __initdata int hashdist = HASHDIST_DEFAULT; #ifdef CONFIG_NUMA -- cgit v1.2.2 From 23316bc86fd31c5d644a71c398ec41d9fecacec4 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 8 Jan 2006 01:00:41 -0800 Subject: [PATCH] mm: cleanup zone_pcp Use zone_pcp everywhere even though NUMA code "knows" the internal details of the zone. Stop other people trying to copy, and it looks nicer. Also, only print the pagesets of online cpus in zoneinfo. Signed-off-by: Nick Piggin Cc: "Seth, Rohit" Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2c46f697e8..6b92a945ae 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -597,7 +597,7 @@ void drain_remote_pages(void) if (zone->zone_pgdat->node_id == numa_node_id()) continue; - pset = zone->pageset[smp_processor_id()]; + pset = zone_pcp(zone, smp_processor_id()); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; @@ -1881,12 +1881,12 @@ static int __devinit process_zones(int cpu) for_each_zone(zone) { - zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), + zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, cpu_to_node(cpu)); - if (!zone->pageset[cpu]) + if (!zone_pcp(zone, cpu)) goto bad; - setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); + setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); if (percpu_pagelist_fraction) setup_pagelist_highmark(zone_pcp(zone, cpu), @@ -1898,8 +1898,8 @@ bad: for_each_zone(dzone) { if (dzone == zone) break; - kfree(dzone->pageset[cpu]); - dzone->pageset[cpu] = NULL; + kfree(zone_pcp(dzone, cpu)); + zone_pcp(dzone, cpu) = NULL; } return -ENOMEM; } @@ -1984,7 +1984,7 @@ static __devinit void zone_pcp_init(struct zone *zone) for (cpu = 0; cpu < NR_CPUS; cpu++) { #ifdef CONFIG_NUMA /* Early boot. Slab allocator not functional yet */ - zone->pageset[cpu] = &boot_pageset[cpu]; + zone_pcp(zone, cpu) = &boot_pageset[cpu]; setup_pageset(&boot_pageset[cpu],0); #else setup_pageset(zone_pcp(zone,cpu), batch); @@ -2227,7 +2227,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) seq_printf(m, ")" "\n pagesets"); - for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { + for_each_online_cpu(i) { struct per_cpu_pageset *pageset; int j; -- cgit v1.2.2 From 48db57f8ff10eb09ab887ccb6150b0da0c7be24e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 8 Jan 2006 01:00:42 -0800 Subject: [PATCH] mm: free_pages opt Try to streamline free_pages_bulk by ensuring callers don't pass in a 'count' that exceeds the list size. Some cleanups: Rename __free_pages_bulk to __free_one_page. Put the page list manipulation from __free_pages_ok into free_one_page. Make __free_pages_ok static. 
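For illustration, here is the resulting caller contract in one place, distilled from the free_hot_cold_page() hunk below rather than added code: free_pages_bulk() no longer returns how many pages it freed, so the caller passes a count it knows does not exceed the list length and keeps pcp->count consistent itself.

	/*
	 * Post-change caller pattern (mirrors the hunk below): only request
	 * pages known to be queued, and adjust pcp->count in the caller,
	 * since free_pages_bulk() now returns void.
	 */
	if (pcp->count >= pcp->high) {
		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
		pcp->count -= pcp->batch;
	}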
Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 58 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6b92a945ae..ad3d0202cd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -308,7 +308,7 @@ static inline int page_is_buddy(struct page *page, int order) * -- wli */ -static inline void __free_pages_bulk (struct page *page, +static inline void __free_one_page(struct page *page, struct zone *zone, unsigned int order) { unsigned long page_idx; @@ -383,40 +383,42 @@ static inline int free_pages_check(struct page *page) * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static int -free_pages_bulk(struct zone *zone, int count, - struct list_head *list, unsigned int order) +static void free_pages_bulk(struct zone *zone, int count, + struct list_head *list, int order) { - struct page *page = NULL; - int ret = 0; - spin_lock(&zone->lock); zone->all_unreclaimable = 0; zone->pages_scanned = 0; - while (!list_empty(list) && count--) { + while (count--) { + struct page *page; + + BUG_ON(list_empty(list)); page = list_entry(list->prev, struct page, lru); - /* have to delete it as __free_pages_bulk list manipulates */ + /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); - __free_pages_bulk(page, zone, order); - ret++; + __free_one_page(page, zone, order); } spin_unlock(&zone->lock); - return ret; } -void __free_pages_ok(struct page *page, unsigned int order) +static void free_one_page(struct zone *zone, struct page *page, int order) { - unsigned long flags; LIST_HEAD(list); + list_add(&page->lru, &list); + free_pages_bulk(zone, 1, &list, order); +} + +static void __free_pages_ok(struct page *page, unsigned int order) +{ + unsigned long flags; int i; int reserved = 0; arch_free_page(page, order); #ifndef CONFIG_MMU - if (order > 0) - for (i = 1 ; i < (1 << order) ; ++i) - __put_page(page + i); + for (i = 1 ; i < (1 << order) ; ++i) + __put_page(page + i); #endif for (i = 0 ; i < (1 << order) ; ++i) @@ -424,11 +426,10 @@ void __free_pages_ok(struct page *page, unsigned int order) if (reserved) return; - list_add(&page->lru, &list); - kernel_map_pages(page, 1<pcp[i]; - if (pcp->count) - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; } } local_irq_restore(flags); @@ -627,8 +627,8 @@ static void __drain_pages(unsigned int cpu) pcp = &pset->pcp[i]; local_irq_save(flags); - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; local_irq_restore(flags); } } @@ -719,8 +719,10 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) __inc_page_state(pgfree); list_add(&page->lru, &pcp->list); pcp->count++; - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + if (pcp->count >= pcp->high) { + free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + pcp->count -= pcp->batch; + } local_irq_restore(flags); put_cpu(); } @@ -759,7 +761,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist, again: cpu = get_cpu(); - if (order == 0) { + if (likely(order == 0)) { struct per_cpu_pages *pcp; pcp = &zone_pcp(zone, cpu)->pcp[cold]; -- cgit v1.2.2 From 15316ba81aee6775d6079fb46c66c801989e7d10 Mon Sep 17 00:00:00 2001 
From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:43 -0800 Subject: [PATCH] add schedule_on_each_cpu() Swap migration's isolate_lru_page() currently uses an IPI to notify other processors that the lru caches need to be drained if the page cannot be found on the LRU. The IPI interrupt may interrupt a processor that is just processing lru requests and cause a race condition. This patch introduces a new function, schedule_on_each_cpu(), that uses keventd to run the LRU draining on each processor. Processors disable preemption when dealing with the LRU caches (these are per processor) and thus executing LRU draining from another process is safe. Thanks to Lee Schermerhorn for finding this race condition. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/workqueue.h | 1 + kernel/workqueue.c | 19 +++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index ac39d04d02..86b1113002 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -65,6 +65,7 @@ extern int FASTCALL(schedule_work(struct work_struct *work)); extern int FASTCALL(schedule_delayed_work(struct work_struct *work, unsigned long delay)); extern int schedule_delayed_work_on(int cpu, struct work_struct *work, unsigned long delay); +extern int schedule_on_each_cpu(void (*func)(void *info), void *info); extern void flush_scheduled_work(void); extern int current_is_keventd(void); extern int keventd_up(void); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2bd5aee1c7..62d4722069 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -419,6 +419,25 @@ int schedule_delayed_work_on(int cpu, return ret; } +int schedule_on_each_cpu(void (*func) (void *info), void *info) +{ + int cpu; + struct work_struct *work; + + work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); + + if (!work) + return -ENOMEM; + for_each_online_cpu(cpu) { + INIT_WORK(work + cpu, func, info); + __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), + work + cpu); + } + flush_workqueue(keventd_wq); + kfree(work); + return 0; +} + void flush_scheduled_work(void) { flush_workqueue(keventd_wq); -- cgit v1.2.2 From 21eac81f252fe31c3cf64b805a1e8652192f3a3b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:45 -0800 Subject: [PATCH] Swap Migration V5: LRU operations This is the start of the `swap migration' patch series. Swap migration allows the moving of the physical location of pages between nodes in a NUMA system while the process is running. This means that the virtual addresses that the process sees do not change. However, the system rearranges the physical location of those pages. The main intent of the page migration patches here is to reduce the latency of memory access by moving pages near to the processor where the process accessing that memory is running. The patchset allows a process to manually relocate the node on which its pages are located through the MF_MOVE and MF_MOVE_ALL options while setting a new memory policy. The pages of a process can also be relocated from another process using the sys_migrate_pages() function call. Requires CAP_SYS_ADMIN. The migrate_pages function call takes two sets of nodes and moves pages of a process that are located on the from nodes to the destination nodes. Manual migration is very useful if for example the scheduler has relocated a process to a processor on a distant node.
A batch scheduler or an administrator can detect the situation and move the pages of the process nearer to the new processor. sys_migrate_pages() could be used on non-NUMA machines as well, to force all of a particular process's pages out to swap, if someone thinks that's useful. Larger installations usually partition the system using cpusets into sections of nodes. Paul has equipped cpusets with the ability to move pages when a task is moved to another cpuset. This allows automatic control over the locality of a process. If a task is moved to a new cpuset then all of its pages are moved with it as well, so that the performance of the process does not sink dramatically (as is the case today). Swap migration works by simply evicting the page. The pages must be faulted back in. The pages are then typically reallocated by the system near the node where the process is executing. For swap migration the destination of the move is controlled by the allocation policy. Cpusets set the allocation policy before calling sys_migrate_pages() in order to move the pages as intended. No allocation policy changes are performed for sys_migrate_pages(). This means that the pages may not be faulted in to the specified nodes if no allocation policy was set by other means. The pages will just end up near the node where the fault occurred. There's another patch series in the pipeline which implements "direct migration". The direct migration patchset extends the migration functionality to avoid going through swap. The destination node of the relocation is controllable during the actual moving of pages. The crutch of using the allocation policy to relocate is not necessary and the pages are moved directly to the target. It's also faster since swap is not used. And sys_migrate_pages() can then move pages directly to the specified node. Implement functions to isolate pages from the LRU and put them back later. This patch: An earlier implementation was provided by Hirokazu Takahashi and IWAMOTO Toshihiro for the memory hotplug project. From: Magnus Damm This breaks out isolate_lru_page() and putback_lru_pages(). Needed for swap migration. Signed-off-by: Magnus Damm Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 22 ++++++++++ include/linux/swap.h | 3 ++ mm/vmscan.c | 100 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 112 insertions(+), 13 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 47762ca695..49cc68af01 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -38,3 +38,25 @@ del_page_from_lru(struct zone *zone, struct page *page) zone->nr_inactive--; } } + +/* + * Isolate one page from the LRU lists.
+ * + * - zone->lru_lock must be held + */ +static inline int __isolate_lru_page(struct page *page) +{ + if (unlikely(!TestClearPageLRU(page))) + return 0; + + if (get_page_testone(page)) { + /* + * It is being freed elsewhere + */ + __put_page(page); + SetPageLRU(page); + return -ENOENT; + } + + return 1; +} diff --git a/include/linux/swap.h b/include/linux/swap.h index 556617bcf7..a49112536c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -175,6 +175,9 @@ extern int try_to_free_pages(struct zone **, gfp_t); extern int shrink_all_memory(int); extern int vm_swappiness; +extern int isolate_lru_page(struct page *p); +extern int putback_lru_pages(struct list_head *l); + #ifdef CONFIG_MMU /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 428c5801d4..261a56ee11 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -593,20 +593,18 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - if (!TestClearPageLRU(page)) - BUG(); - list_del(&page->lru); - if (get_page_testone(page)) { - /* - * It is being freed elsewhere - */ - __put_page(page); - SetPageLRU(page); - list_add(&page->lru, src); - continue; - } else { - list_add(&page->lru, dst); + switch (__isolate_lru_page(page)) { + case 1: + /* Succeeded to isolate page */ + list_move(&page->lru, dst); nr_taken++; + break; + case -ENOENT: + /* Not possible to isolate */ + list_move(&page->lru, src); + break; + default: + BUG(); } } @@ -614,6 +612,48 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, return nr_taken; } +static void lru_add_drain_per_cpu(void *dummy) +{ + lru_add_drain(); +} + +/* + * Isolate one page from the LRU lists and put it on the + * indicated list. Do necessary cache draining if the + * page is not on the LRU lists yet. + * + * Result: + * 0 = page not on LRU list + * 1 = page removed from LRU list and added to the specified list. + * -ENOENT = page is being freed elsewhere. + */ +int isolate_lru_page(struct page *page) +{ + int rc = 0; + struct zone *zone = page_zone(page); + +redo: + spin_lock_irq(&zone->lru_lock); + rc = __isolate_lru_page(page); + if (rc == 1) { + if (PageActive(page)) + del_page_from_active_list(zone, page); + else + del_page_from_inactive_list(zone, page); + } + spin_unlock_irq(&zone->lru_lock); + if (rc == 0) { + /* + * Maybe this page is still waiting for a cpu to drain it + * from one of the lru lists? + */ + rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); + if (rc == 0 && PageLRU(page)) + goto redo; + } + return rc; +} + /* * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed */ @@ -679,6 +719,40 @@ done: pagevec_release(&pvec); } +static inline void move_to_lru(struct page *page) +{ + list_del(&page->lru); + if (PageActive(page)) { + /* + * lru_cache_add_active checks that + * the PG_active bit is off. + */ + ClearPageActive(page); + lru_cache_add_active(page); + } else { + lru_cache_add(page); + } + put_page(page); +} + +/* + * Add isolated pages on the list back to the LRU + * + * returns the number of pages put back. + */ +int putback_lru_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + int count = 0; + + list_for_each_entry_safe(page, page2, l, lru) { + move_to_lru(page); + count++; + } + return count; +} + /* * This moves pages from the active list to the inactive list. 
* -- cgit v1.2.2 From 930d915252edda7042c944ed3c30194a2f9fe163 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:47 -0800 Subject: [PATCH] Swap Migration V5: PF_SWAPWRITE to allow writing to swap Add PF_SWAPWRITE to control a process's permission to write to swap. - Use PF_SWAPWRITE in may_write_to_queue() instead of checking for kswapd and pdflush - Set PF_SWAPWRITE flag for kswapd and pdflush Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/pdflush.c | 2 +- mm/vmscan.c | 6 ++---- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 7da33619d5..a74662077d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -895,6 +895,7 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ #define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */ +#define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */ /* * Only the _current_ task can read/write to tsk->flags, but other diff --git a/mm/pdflush.c b/mm/pdflush.c index 52822c98c4..c4b6d0afd7 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -90,7 +90,7 @@ struct pdflush_work { static int __pdflush(struct pdflush_work *my_work) { - current->flags |= PF_FLUSHER; + current->flags |= PF_FLUSHER | PF_SWAPWRITE; my_work->fn = NULL; my_work->who = current; INIT_LIST_HEAD(&my_work->list); diff --git a/mm/vmscan.c b/mm/vmscan.c index 261a56ee11..6c30a8c597 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -268,9 +268,7 @@ static inline int is_page_cache_freeable(struct page *page) static int may_write_to_queue(struct backing_dev_info *bdi) { - if (current_is_kswapd()) - return 1; - if (current_is_pdflush()) /* This is unlikely, but why not... */ + if (current->flags & PF_SWAPWRITE) return 1; if (!bdi_write_congested(bdi)) return 1; @@ -1299,7 +1297,7 @@ static int kswapd(void *p) * us from recursively trying to free more memory as we're * trying to free the first piece of memory in the first place). */ - tsk->flags |= PF_MEMALLOC|PF_KSWAPD; + tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; order = 0; for ( ; ; ) { -- cgit v1.2.2 From 49d2e9cc4544369635cd6f4ef6d5bb0f757079a7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:48 -0800 Subject: [PATCH] Swap Migration V5: migrate_pages() function This adds the basic page migration function with a minimal implementation that only allows the eviction of pages to swap space. Page eviction and migration may be useful to migrate pages, to suspend programs or for remapping single pages (useful for faulty pages or pages with soft ECC failures). The process is as follows: The function wanting to migrate pages must first build a list of pages to be migrated or evicted and take them off the lru lists via isolate_lru_page(). isolate_lru_page() determines that a page is freeable based on the LRU bit being set. Then the actual migration or swapout can happen by calling migrate_pages(). migrate_pages() does its best to migrate or swap out the pages and makes multiple passes over the list. Some pages may only be swappable if they are not dirty. migrate_pages may start writing out dirty pages in the initial passes over the pages. However, migrate_pages may not be able to migrate or evict all pages for a variety of reasons.
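(Illustration only, not code from this patch: a hedged sketch of the calling convention just described, written as a hypothetical in-kernel helper. The two-list signature is the one introduced here; the swap-based implementation ignores the second list.)

#include <linux/list.h>
#include <linux/mm.h>
#include <linux/swap.h>

/* Hypothetical caller: collect pages off the LRU, then evict them to swap. */
static int evict_page_list_example(struct page **pages, int nr)
{
	LIST_HEAD(pagelist);
	int i, nr_failed;

	for (i = 0; i < nr; i++)
		if (isolate_lru_page(pages[i]) == 1)    /* 1 == isolated */
			list_add(&pages[i]->lru, &pagelist);

	/* Swap-based migration: the "to" list is unused, so pass NULL. */
	nr_failed = migrate_pages(&pagelist, NULL);

	/* Anything that could not be migrated is still on the list. */
	putback_lru_pages(&pagelist);
	return nr_failed;
}

This mirrors what the mempolicy code in the later MPOL_MF_MOVE patch does with the pagelist it collects during its page table walk.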
The remaining pages may be returned to the LRU lists using putback_lru_pages(). Changelog V4->V5: - Use the lru caches to return pages to the LRU Changelog V3->V4: - Restructure code so that applying patches to support full migration does require minimal changes. Rename swapout_pages() to migrate_pages(). Changelog V2->V3: - Extract common code from shrink_list() and swapout_pages() Signed-off-by: Mike Kravetz Signed-off-by: Christoph Lameter Cc: "Michael Kerrisk" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 + mm/vmscan.c | 214 +++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 182 insertions(+), 34 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index a49112536c..893096e67b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -178,6 +178,8 @@ extern int vm_swappiness; extern int isolate_lru_page(struct page *p); extern int putback_lru_pages(struct list_head *l); +extern int migrate_pages(struct list_head *l, struct list_head *t); + #ifdef CONFIG_MMU /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 6c30a8c597..a537a7f163 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -373,6 +373,43 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) return PAGE_CLEAN; } +static int remove_mapping(struct address_space *mapping, struct page *page) +{ + if (!mapping) + return 0; /* truncate got there first */ + + write_lock_irq(&mapping->tree_lock); + + /* + * The non-racy check for busy page. It is critical to check + * PageDirty _after_ making sure that the page is freeable and + * not in use by anybody. (pagecache + us == 2) + */ + if (unlikely(page_count(page) != 2)) + goto cannot_free; + smp_rmb(); + if (unlikely(PageDirty(page))) + goto cannot_free; + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; + __delete_from_swap_cache(page); + write_unlock_irq(&mapping->tree_lock); + swap_free(swap); + __put_page(page); /* The pagecache ref */ + return 1; + } + + __remove_from_page_cache(page); + write_unlock_irq(&mapping->tree_lock); + __put_page(page); + return 1; + +cannot_free: + write_unlock_irq(&mapping->tree_lock); + return 0; +} + /* * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed */ @@ -504,36 +541,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) goto free_it; } - if (!mapping) - goto keep_locked; /* truncate got there first */ - - write_lock_irq(&mapping->tree_lock); - - /* - * The non-racy check for busy page. It is critical to check - * PageDirty _after_ making sure that the page is freeable and - * not in use by anybody. 
(pagecache + us == 2) - */ - if (unlikely(page_count(page) != 2)) - goto cannot_free; - smp_rmb(); - if (unlikely(PageDirty(page))) - goto cannot_free; - -#ifdef CONFIG_SWAP - if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page_private(page) }; - __delete_from_swap_cache(page); - write_unlock_irq(&mapping->tree_lock); - swap_free(swap); - __put_page(page); /* The pagecache ref */ - goto free_it; - } -#endif /* CONFIG_SWAP */ - - __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); - __put_page(page); + if (!remove_mapping(mapping, page)) + goto keep_locked; free_it: unlock_page(page); @@ -542,10 +551,6 @@ free_it: __pagevec_release_nonlru(&freed_pvec); continue; -cannot_free: - write_unlock_irq(&mapping->tree_lock); - goto keep_locked; - activate_locked: SetPageActive(page); pgactivate++; @@ -563,6 +568,147 @@ keep: return reclaimed; } +/* + * swapout a single page + * page is locked upon entry, unlocked on exit + * + * return codes: + * 0 = complete + * 1 = retry + */ +static int swap_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (page_mapped(page) && mapping) + if (try_to_unmap(page) != SWAP_SUCCESS) + goto unlock_retry; + + if (PageDirty(page)) { + /* Page is dirty, try to write it out here */ + switch(pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_retry; + + case PAGE_SUCCESS: + goto retry; + + case PAGE_CLEAN: + ; /* try to free the page below */ + } + } + + if (PagePrivate(page)) { + if (!try_to_release_page(page, GFP_KERNEL) || + (!mapping && page_count(page) == 1)) + goto unlock_retry; + } + + if (remove_mapping(mapping, page)) { + /* Success */ + unlock_page(page); + return 0; + } + +unlock_retry: + unlock_page(page); + +retry: + return 1; +} +/* + * migrate_pages + * + * Two lists are passed to this function. The first list + * contains the pages isolated from the LRU to be migrated. + * The second list contains new pages that the pages isolated + * can be moved to. If the second list is NULL then all + * pages are swapped out. + * + * The function returns after 10 attempts or if no pages + * are movable anymore because t has become empty + * or no retryable pages exist anymore. + * + * SIMPLIFIED VERSION: This implementation of migrate_pages + * is only swapping out pages and never touches the second + * list. The direct migration patchset + * extends this function to avoid the use of swap. + */ +int migrate_pages(struct list_head *l, struct list_head *t) +{ + int retry; + LIST_HEAD(failed); + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int swapwrite = current->flags & PF_SWAPWRITE; + + if (!swapwrite) + current->flags |= PF_SWAPWRITE; + +redo: + retry = 0; + + list_for_each_entry_safe(page, page2, l, lru) { + cond_resched(); + + /* + * Skip locked pages during the first two passes to give the + * functions holding the lock time to release the page. Later we use + * lock_page to have a higher chance of acquiring the lock. + */ + if (pass > 2) + lock_page(page); + else + if (TestSetPageLocked(page)) + goto retry_later; + + /* + * Only wait on writeback if we have already done a pass where + * we we may have triggered writeouts for lots of pages. 
+ */ + if (pass > 0) + wait_on_page_writeback(page); + else + if (PageWriteback(page)) { + unlock_page(page); + goto retry_later; + } + +#ifdef CONFIG_SWAP + if (PageAnon(page) && !PageSwapCache(page)) { + if (!add_to_swap(page)) { + unlock_page(page); + list_move(&page->lru, &failed); + nr_failed++; + continue; + } + } +#endif /* CONFIG_SWAP */ + + /* + * Page is properly locked and writeback is complete. + * Try to migrate the page. + */ + if (swap_page(page)) { +retry_later: + retry++; + } + } + if (retry && pass++ < 10) + goto redo; + + if (!swapwrite) + current->flags &= ~PF_SWAPWRITE; + + if (!list_empty(&failed)) + list_splice(&failed, l); + + return nr_failed + retry; +} + /* * zone->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages -- cgit v1.2.2 From 7cbe34cf86c673503b177ff47cfa2c7030dabb50 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:49 -0800 Subject: [PATCH] Swap Migration V5: Add CONFIG_MIGRATION for page migration support Include page migration if the system is NUMA or having a memory model that allows distinct areas of memory (SPARSEMEM, DISCONTIGMEM). And: - Only include lru_add_drain_per_cpu if building for an SMP system. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 ++ mm/Kconfig | 7 +++++++ mm/vmscan.c | 20 +++++++++++--------- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 893096e67b..117add066f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -178,7 +178,9 @@ extern int vm_swappiness; extern int isolate_lru_page(struct page *p); extern int putback_lru_pages(struct list_head *l); +#ifdef CONFIG_MIGRATION extern int migrate_pages(struct list_head *l, struct list_head *t); +#endif #ifdef CONFIG_MMU /* linux/mm/shmem.c */ diff --git a/mm/Kconfig b/mm/Kconfig index b3db11f137..a9cb80ae64 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -132,3 +132,10 @@ config SPLIT_PTLOCK_CPUS default "4096" if ARM && !CPU_CACHE_VIPT default "4096" if PARISC && !PA20 default "4" + +# +# support for page migration +# +config MIGRATION + def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM + depends on SWAP diff --git a/mm/vmscan.c b/mm/vmscan.c index a537a7f163..58270aea66 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -568,6 +568,7 @@ keep: return reclaimed; } +#ifdef CONFIG_MIGRATION /* * swapout a single page * page is locked upon entry, unlocked on exit @@ -656,8 +657,9 @@ redo: /* * Skip locked pages during the first two passes to give the - * functions holding the lock time to release the page. Later we use - * lock_page to have a higher chance of acquiring the lock. + * functions holding the lock time to release the page. Later we + * use lock_page() to have a higher chance of acquiring the + * lock. */ if (pass > 2) lock_page(page); @@ -669,15 +671,15 @@ redo: * Only wait on writeback if we have already done a pass where * we we may have triggered writeouts for lots of pages. */ - if (pass > 0) + if (pass > 0) { wait_on_page_writeback(page); - else + } else { if (PageWriteback(page)) { unlock_page(page); goto retry_later; } + } -#ifdef CONFIG_SWAP if (PageAnon(page) && !PageSwapCache(page)) { if (!add_to_swap(page)) { unlock_page(page); @@ -686,16 +688,15 @@ redo: continue; } } -#endif /* CONFIG_SWAP */ /* * Page is properly locked and writeback is complete. * Try to migrate the page. 
*/ - if (swap_page(page)) { + if (!swap_page(page)) + continue; retry_later: - retry++; - } + retry++; } if (retry && pass++ < 10) goto redo; @@ -708,6 +709,7 @@ retry_later: return nr_failed + retry; } +#endif /* * zone->lru_lock is heavily contended. Some of the functions that -- cgit v1.2.2 From dc9aa5b9d65fd11b1f5246b46ec610ee8b83c6dd Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:50 -0800 Subject: [PATCH] Swap Migration V5: MPOL_MF_MOVE interface Add page migration support via swap to the NUMA policy layer This patch adds page migration support to the NUMA policy layer. An additional flag MPOL_MF_MOVE is introduced for mbind. If MPOL_MF_MOVE is specified then pages that do not conform to the memory policy will be evicted from memory. When they get pages back in new pages will be allocated following the numa policy. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 3 + mm/mempolicy.c | 155 ++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 138 insertions(+), 20 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index ed00b278cb..05443a766c 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -22,6 +22,9 @@ /* Flags for mbind */ #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ +#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ +#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ +#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */ #ifdef __KERNEL__ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0f1d2b8a95..9cc6d96283 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -83,9 +83,14 @@ #include #include #include +#include + #include #include +/* Internal MPOL_MF_xxx flags */ +#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ + static kmem_cache_t *policy_cache; static kmem_cache_t *sn_cache; @@ -174,9 +179,59 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) return policy; } +/* Check if we are the only process mapping the page in question */ +static inline int single_mm_mapping(struct mm_struct *mm, + struct address_space *mapping) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int rc = 1; + + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) + if (mm != vma->vm_mm) { + rc = 0; + goto out; + } + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + if (mm != vma->vm_mm) { + rc = 0; + goto out; + } +out: + spin_unlock(&mapping->i_mmap_lock); + return rc; +} + +/* + * Add a page to be migrated to the pagelist + */ +static void migrate_page_add(struct vm_area_struct *vma, + struct page *page, struct list_head *pagelist, unsigned long flags) +{ + /* + * Avoid migrating a page that is shared by others and not writable. + */ + if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || + mapping_writably_mapped(page->mapping) || + single_mm_mapping(vma->vm_mm, page->mapping)) { + int rc = isolate_lru_page(page); + + if (rc == 1) + list_add(&page->lru, pagelist); + /* + * If the isolate attempt was not successful then we just + * encountered an unswappable page. Something must be wrong. + */ + WARN_ON(rc == 0); + } +} + /* Ensure all existing pages follow the policy. 
*/ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { pte_t *orig_pte; pte_t *pte; @@ -193,15 +248,21 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!page) continue; nid = page_to_nid(page); - if (!node_isset(nid, *nodes)) - break; + if (!node_isset(nid, *nodes)) { + if (pagelist) + migrate_page_add(vma, page, pagelist, flags); + else + break; + } } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); return addr != end; } static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { pmd_t *pmd; unsigned long next; @@ -211,14 +272,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - if (check_pte_range(vma, pmd, addr, next, nodes)) + if (check_pte_range(vma, pmd, addr, next, nodes, + flags, pagelist)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; } static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { pud_t *pud; unsigned long next; @@ -228,14 +292,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - if (check_pmd_range(vma, pud, addr, next, nodes)) + if (check_pmd_range(vma, pud, addr, next, nodes, + flags, pagelist)) return -EIO; } while (pud++, addr = next, addr != end); return 0; } static inline int check_pgd_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { pgd_t *pgd; unsigned long next; @@ -245,16 +312,31 @@ static inline int check_pgd_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (check_pud_range(vma, pgd, addr, next, nodes)) + if (check_pud_range(vma, pgd, addr, next, nodes, + flags, pagelist)) return -EIO; } while (pgd++, addr = next, addr != end); return 0; } -/* Step 1: check the range */ +/* Check if a vma is migratable */ +static inline int vma_migratable(struct vm_area_struct *vma) +{ + if (vma->vm_flags & ( + VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP)) + return 0; + return 1; +} + +/* + * Check if all pages in a range are on a set of nodes. + * If pagelist != NULL then isolate pages from the LRU and + * put them on the pagelist. 
+ */ static struct vm_area_struct * check_range(struct mm_struct *mm, unsigned long start, unsigned long end, - nodemask_t *nodes, unsigned long flags) + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { int err; struct vm_area_struct *first, *vma, *prev; @@ -264,17 +346,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { - if (!vma->vm_next && vma->vm_end < end) - return ERR_PTR(-EFAULT); - if (prev && prev->vm_end < vma->vm_start) - return ERR_PTR(-EFAULT); - if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { + if (!(flags & MPOL_MF_DISCONTIG_OK)) { + if (!vma->vm_next && vma->vm_end < end) + return ERR_PTR(-EFAULT); + if (prev && prev->vm_end < vma->vm_start) + return ERR_PTR(-EFAULT); + } + if (!is_vm_hugetlb_page(vma) && + ((flags & MPOL_MF_STRICT) || + ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && + vma_migratable(vma)))) { unsigned long endvma = vma->vm_end; + if (endvma > end) endvma = end; if (vma->vm_start > start) start = vma->vm_start; - err = check_pgd_range(vma, start, endvma, nodes); + err = check_pgd_range(vma, start, endvma, nodes, + flags, pagelist); if (err) { first = ERR_PTR(err); break; @@ -348,33 +437,59 @@ long do_mbind(unsigned long start, unsigned long len, struct mempolicy *new; unsigned long end; int err; + LIST_HEAD(pagelist); - if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) + if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + || mode > MPOL_MAX) return -EINVAL; + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + if (start & ~PAGE_MASK) return -EINVAL; + if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; + len = (len + PAGE_SIZE - 1) & PAGE_MASK; end = start + len; + if (end < start) return -EINVAL; if (end == start) return 0; + if (mpol_check_policy(mode, nmask)) return -EINVAL; + new = mpol_new(mode, nmask); if (IS_ERR(new)) return PTR_ERR(new); + /* + * If we are using the default policy then operation + * on discontinuous address spaces is okay after all + */ + if (!new) + flags |= MPOL_MF_DISCONTIG_OK; + PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, mode,nodes_addr(nodes)[0]); down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nmask, flags); + vma = check_range(mm, start, end, nmask, flags, + (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL); err = PTR_ERR(vma); - if (!IS_ERR(vma)) + if (!IS_ERR(vma)) { err = mbind_range(vma, start, end, new); + if (!list_empty(&pagelist)) + migrate_pages(&pagelist, NULL); + if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT)) + err = -EIO; + } + if (!list_empty(&pagelist)) + putback_lru_pages(&pagelist); + up_write(&mm->mmap_sem); mpol_free(new); return err; -- cgit v1.2.2 From 39743889aaf76725152f16aa90ca3c45f6d52da3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:51 -0800 Subject: [PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. 
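(A hedged illustration, not part of the patch: roughly how a userspace tool might invoke the new system call. No C library wrapper is assumed, the syscall number must match the per-architecture tables added below, and the i386 value is used here only as a fallback.)

#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_migrate_pages
#define __NR_migrate_pages 294   /* i386 number from this patch; adjust per arch */
#endif

int main(int argc, char **argv)
{
	pid_t pid = (argc > 1) ? (pid_t)atoi(argv[1]) : getpid();
	unsigned long from = 1UL << 0;   /* nodes to migrate pages away from */
	unsigned long to   = 1UL << 1;   /* intended destination nodes */
	long not_moved;

	not_moved = syscall(__NR_migrate_pages, pid, 8 * sizeof(unsigned long),
			    &from, &to);
	if (not_moved < 0)
		perror("migrate_pages");
	else
		printf("%ld page(s) could not be moved\n", not_moved);
	return 0;
}

With the swap-based implementation the kernel simply evicts the pages found on the "from" nodes; where they end up after faulting back in is governed by the allocation policy, as the changelog notes.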
sys_migrate_pages is also useful for manually moving a process's memory if its available memory nodes have changed through cpuset operations. Paul Jackson is working on an automated mechanism that will perform the migration automatically if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. In contrast to Ray's implementation, sys_migrate_pages does not modify policies. The current code here is based on the swap-based page migration capability and thus is not able to preserve the physical layout relative to its containing nodeset (which may be a cpuset). When direct page migration becomes available, the implementation will need to be changed to do an isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in the source nodeset that are not in the target nodeset. The patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/syscall_table.S | 1 + arch/ia64/kernel/entry.S | 1 + arch/x86_64/ia32/ia32entry.S | 1 + include/asm-i386/unistd.h | 3 +- include/asm-ia64/unistd.h | 3 +- include/asm-x86_64/ia32_unistd.h | 3 +- include/asm-x86_64/unistd.h | 4 +- include/linux/mempolicy.h | 3 ++ include/linux/syscalls.h | 2 + kernel/sys_ni.c | 1 + mm/mempolicy.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 11 files changed, 111 insertions(+), 5 deletions(-) diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index f7ba4acc20..6ff3e52432 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -293,3 +293,4 @@ ENTRY(sys_call_table) .long sys_inotify_init .long sys_inotify_add_watch .long sys_inotify_rm_watch + .long sys_migrate_pages diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 0741b066b9..7a6ffd6137 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1600,5 +1600,6 @@ sys_call_table: data8 sys_inotify_init data8 sys_inotify_add_watch data8 sys_inotify_rm_watch + data8 sys_migrate_pages // 1280 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index df0773c9bd..1f0ff5adc8 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -643,6 +643,7 @@ ia32_sys_call_table: .quad sys_inotify_init .quad sys_inotify_add_watch .quad sys_inotify_rm_watch + .quad sys_migrate_pages ia32_syscall_end: .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 .quad ni_syscall diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index fe38b9a962..481c3c0ea7 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -299,8 +299,9 @@ #define __NR_inotify_init 291 #define __NR_inotify_add_watch 292 #define __NR_inotify_rm_watch 293 +#define __NR_migrate_pages 294 -#define NR_syscalls 294 +#define NR_syscalls 295 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h index 2bf543493c..962f9bd1bd 100644 --- a/include/asm-ia64/unistd.h +++ b/include/asm-ia64/unistd.h @@ -269,12 +269,13 @@ #define __NR_inotify_init 1277 #define __NR_inotify_add_watch 1278 #define
__NR_inotify_rm_watch 1279 +#define __NR_migrate_pages 1280 #ifdef __KERNEL__ #include -#define NR_syscalls 256 /* length of syscall table */ +#define NR_syscalls 270 /* length of syscall table */ #define __ARCH_WANT_SYS_RT_SIGACTION diff --git a/include/asm-x86_64/ia32_unistd.h b/include/asm-x86_64/ia32_unistd.h index d5166ec386..e8843362a6 100644 --- a/include/asm-x86_64/ia32_unistd.h +++ b/include/asm-x86_64/ia32_unistd.h @@ -299,7 +299,8 @@ #define __NR_ia32_inotify_init 291 #define __NR_ia32_inotify_add_watch 292 #define __NR_ia32_inotify_rm_watch 293 +#define __NR_ia32_migrate_pages 294 -#define IA32_NR_syscalls 294 /* must be > than biggest syscall! */ +#define IA32_NR_syscalls 295 /* must be > than biggest syscall! */ #endif /* _ASM_X86_64_IA32_UNISTD_H_ */ diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 2c42150bce..e6f896161c 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -571,8 +571,10 @@ __SYSCALL(__NR_inotify_init, sys_inotify_init) __SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) #define __NR_inotify_rm_watch 255 __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) +#define __NR_migrate_pages 256 +__SYSCALL(__NR_migrate_pages, sys_migrate_pages) -#define __NR_syscall_max __NR_inotify_rm_watch +#define __NR_syscall_max __NR_migrate_pages #ifndef __NO_STUBS /* user-visible error numbers are in the range -1 - -4095 */ diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 05443a766c..3e61e82968 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -162,6 +162,9 @@ static inline void check_highest_zone(int k) policy_zone = k; } +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags); + #else struct mempolicy {}; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c7007b1db9..e910d1a481 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -511,5 +511,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio); asmlinkage long sys_ioprio_get(int which, int who); asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, unsigned long maxnode); +asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, + const unsigned long __user *from, const unsigned long __user *to); #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1ab2370e2e..7a8bc7f60d 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -82,6 +82,7 @@ cond_syscall(compat_sys_socketcall); cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); +cond_syscall(sys_migrate_pages); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9cc6d96283..20d5ad39fa 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -614,12 +614,42 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +/* + * For now migrate_pages simply swaps out the pages from nodes that are in + * the source set but not in the target set. In the future, we would + * want a function that moves pages between the two nodesets in such + * a way as to preserve the physical layout as much as possible. + * + * Returns the number of page that could not be moved. 
+ */ +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + LIST_HEAD(pagelist); + int count = 0; + nodemask_t nodes; + + nodes_andnot(nodes, *from_nodes, *to_nodes); + nodes_complement(nodes, nodes); + + down_read(&mm->mmap_sem); + check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + if (!list_empty(&pagelist)) { + migrate_pages(&pagelist, NULL); + if (!list_empty(&pagelist)) + count = putback_lru_pages(&pagelist); + } + up_read(&mm->mmap_sem); + return count; +} + /* * User space interface with variable sized bitmaps for nodelists. */ /* Copy a node mask from user space. */ -static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, +static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long k; @@ -708,6 +738,68 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, return do_set_mempolicy(mode, &nodes); } +/* Macro needed until Paul implements this function in kernel/cpusets.c */ +#define cpuset_mems_allowed(task) node_online_map + +asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, + const unsigned long __user *old_nodes, + const unsigned long __user *new_nodes) +{ + struct mm_struct *mm; + struct task_struct *task; + nodemask_t old; + nodemask_t new; + nodemask_t task_nodes; + int err; + + err = get_nodes(&old, old_nodes, maxnode); + if (err) + return err; + + err = get_nodes(&new, new_nodes, maxnode); + if (err) + return err; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_pid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser priviledges or the same + * userid as the target process. + */ + if ((current->euid != task->suid) && (current->euid != task->uid) && + (current->uid != task->suid) && (current->uid != task->uid) && + !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + task_nodes = cpuset_mems_allowed(task); + /* Is the user allowed to access the target nodes? */ + if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE); +out: + mmput(mm); + return err; +} + + /* Retrieve NUMA policy */ asmlinkage long sys_get_mempolicy(int __user *policy, unsigned long __user *nmask, -- cgit v1.2.2 From 8419c3181086c86664e8246bc997afc2e4ffba4f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:52 -0800 Subject: [PATCH] SwapMig: CONFIG_MIGRATION fixes Move move_to_lru, putback_lru_pages and isolate_lru in section surrounded by CONFIG_MIGRATION saving some codesize for single processor kernels. 
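(Illustrative sketch only, not part of this patch: with the declarations now living inside CONFIG_MIGRATION, a hypothetical caller that has to build on both configurations would guard its use explicitly, for example like this.)

#include <linux/list.h>
#include <linux/mm.h>
#include <linux/swap.h>

/* Hypothetical helper: try to pull a page off the LRU for later migration,
 * but compile away to a no-op when CONFIG_MIGRATION is not set. */
static int maybe_collect_page(struct page *page, struct list_head *pagelist)
{
#ifdef CONFIG_MIGRATION
	if (isolate_lru_page(page) == 1) {
		list_add(&page->lru, pagelist);
		return 1;
	}
#endif
	return 0;
}

The Kconfig hunk earlier in the series makes CONFIG_MIGRATION default to y on NUMA, SPARSEMEM or DISCONTIGMEM systems with swap enabled, so such guards mainly matter for code shared with other configurations.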
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 +- mm/vmscan.c | 152 +++++++++++++++++++++++++-------------------------- 2 files changed, 77 insertions(+), 78 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 117add066f..997d838f0e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -175,10 +175,9 @@ extern int try_to_free_pages(struct zone **, gfp_t); extern int shrink_all_memory(int); extern int vm_swappiness; +#ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p); extern int putback_lru_pages(struct list_head *l); - -#ifdef CONFIG_MIGRATION extern int migrate_pages(struct list_head *l, struct list_head *t); #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 58270aea66..daed4a73b7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -569,6 +569,40 @@ keep: } #ifdef CONFIG_MIGRATION +static inline void move_to_lru(struct page *page) +{ + list_del(&page->lru); + if (PageActive(page)) { + /* + * lru_cache_add_active checks that + * the PG_active bit is off. + */ + ClearPageActive(page); + lru_cache_add_active(page); + } else { + lru_cache_add(page); + } + put_page(page); +} + +/* + * Add isolated pages on the list back to the LRU + * + * returns the number of pages put back. + */ +int putback_lru_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + int count = 0; + + list_for_each_entry_safe(page, page2, l, lru) { + move_to_lru(page); + count++; + } + return count; +} + /* * swapout a single page * page is locked upon entry, unlocked on exit @@ -709,6 +743,48 @@ retry_later: return nr_failed + retry; } + +static void lru_add_drain_per_cpu(void *dummy) +{ + lru_add_drain(); +} + +/* + * Isolate one page from the LRU lists and put it on the + * indicated list. Do necessary cache draining if the + * page is not on the LRU lists yet. + * + * Result: + * 0 = page not on LRU list + * 1 = page removed from LRU list and added to the specified list. + * -ENOENT = page is being freed elsewhere. + */ +int isolate_lru_page(struct page *page) +{ + int rc = 0; + struct zone *zone = page_zone(page); + +redo: + spin_lock_irq(&zone->lru_lock); + rc = __isolate_lru_page(page); + if (rc == 1) { + if (PageActive(page)) + del_page_from_active_list(zone, page); + else + del_page_from_inactive_list(zone, page); + } + spin_unlock_irq(&zone->lru_lock); + if (rc == 0) { + /* + * Maybe this page is still waiting for a cpu to drain it + * from one of the lru lists? + */ + rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); + if (rc == 0 && PageLRU(page)) + goto redo; + } + return rc; +} #endif /* @@ -758,48 +834,6 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, return nr_taken; } -static void lru_add_drain_per_cpu(void *dummy) -{ - lru_add_drain(); -} - -/* - * Isolate one page from the LRU lists and put it on the - * indicated list. Do necessary cache draining if the - * page is not on the LRU lists yet. - * - * Result: - * 0 = page not on LRU list - * 1 = page removed from LRU list and added to the specified list. - * -ENOENT = page is being freed elsewhere. 
- */ -int isolate_lru_page(struct page *page) -{ - int rc = 0; - struct zone *zone = page_zone(page); - -redo: - spin_lock_irq(&zone->lru_lock); - rc = __isolate_lru_page(page); - if (rc == 1) { - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - } - spin_unlock_irq(&zone->lru_lock); - if (rc == 0) { - /* - * Maybe this page is still waiting for a cpu to drain it - * from one of the lru lists? - */ - rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); - if (rc == 0 && PageLRU(page)) - goto redo; - } - return rc; -} - /* * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed */ @@ -865,40 +899,6 @@ done: pagevec_release(&pvec); } -static inline void move_to_lru(struct page *page) -{ - list_del(&page->lru); - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } - put_page(page); -} - -/* - * Add isolated pages on the list back to the LRU - * - * returns the number of pages put back. - */ -int putback_lru_pages(struct list_head *l) -{ - struct page *page; - struct page *page2; - int count = 0; - - list_for_each_entry_safe(page, page2, l, lru) { - move_to_lru(page); - count++; - } - return count; -} - /* * This moves pages from the active list to the inactive list. * -- cgit v1.2.2 From 1480a540c98525640174a7eadd712378fcd6fd63 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:53 -0800 Subject: [PATCH] SwapMig: add_to_swap() avoid atomic allocations Add gfp_mask to add_to_swap add_to_swap does allocations with GFP_ATOMIC in order not to interfere with swapping. During migration we may have use add_to_swap extensively which may lead to out of memory errors. This patch makes add_to_swap take a parameter that specifies the gfp mask. The page migration code can then make add_to_swap use GFP_KERNEL. Signed-off-by: Hirokazu Takahashi Signed-off-by: Dave Hansen Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/swap_state.c | 4 ++-- mm/vmscan.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 997d838f0e..eb591eaad1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -198,7 +198,7 @@ extern int rw_swap_page_sync(int, swp_entry_t, struct page *); extern struct address_space swapper_space; #define total_swapcache_pages swapper_space.nrpages extern void show_swap_cache_info(void); -extern int add_to_swap(struct page *); +extern int add_to_swap(struct page *, gfp_t); extern void __delete_from_swap_cache(struct page *); extern void delete_from_swap_cache(struct page *); extern int move_to_swap_cache(struct page *, swp_entry_t); diff --git a/mm/swap_state.c b/mm/swap_state.c index fc2aecb70a..7b09ac503f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -141,7 +141,7 @@ void __delete_from_swap_cache(struct page *page) * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. 
*/ -int add_to_swap(struct page * page) +int add_to_swap(struct page * page, gfp_t gfp_mask) { swp_entry_t entry; int err; @@ -166,7 +166,7 @@ int add_to_swap(struct page * page) * Add it to the swap cache and mark it dirty */ err = __add_to_swap_cache(page, entry, - GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN); + gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); switch (err) { case 0: /* Success */ diff --git a/mm/vmscan.c b/mm/vmscan.c index daed4a73b7..5393b093a8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -458,7 +458,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Try to allocate it some swap space here. */ if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page)) + if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; } #endif /* CONFIG_SWAP */ @@ -715,7 +715,7 @@ redo: } if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page)) { + if (!add_to_swap(page, GFP_KERNEL)) { unlock_page(page); list_move(&page->lru, &failed); nr_failed++; -- cgit v1.2.2 From ee27497df36823f2793212cad0997c044eb0e1eb Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:54 -0800 Subject: [PATCH] SwapMig: Drop unused pages immediately If a page is encountered that is only referenced by the migration code, then there is no reason to swap or migrate the page. Release the page by calling move_to_lru(). Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5393b093a8..73ba4046ed 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -689,6 +689,11 @@ redo: list_for_each_entry_safe(page, page2, l, lru) { cond_resched(); + if (page_count(page) == 1) { + /* page was freed from under us. So we are done. */ + move_to_lru(page); + continue; + } /* * Skip locked pages during the first two passes to give the * functions holding the lock time to release the page. Later we -- cgit v1.2.2 From d498471133ff1f9586a06820beaeebc575fe2814 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:55 -0800 Subject: [PATCH] SwapMig: Extend parameters for migrate_pages() Extend the parameters of migrate_pages() to give the caller control over the fate of successfully migrated and impossible-to-migrate pages. Swap migration and direct migration will have the same interface after this patch so that patches can be independently applied to the policy layer and the core migration code.
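(Hedged sketch for illustration; it simply mirrors the swap_pages() helper that the mempolicy.c hunk below adds, with hypothetical naming.)

#include <linux/list.h>
#include <linux/swap.h>

/* Hypothetical caller of the extended interface. */
static int evict_collected_pages(struct list_head *pagelist)
{
	LIST_HEAD(moved);
	LIST_HEAD(failed);
	int nr_failed;

	/* Swap-based migration still ignores the "to" list, so pass NULL. */
	nr_failed = migrate_pages(pagelist, NULL, &moved, &failed);

	/* The caller now decides the fate of each outcome list; here both
	 * simply go back onto the LRU. */
	putback_lru_pages(&failed);
	putback_lru_pages(&moved);
	return nr_failed;
}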
Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 ++- mm/mempolicy.c | 27 ++++++++++++++++++++++----- mm/vmscan.c | 17 ++++++++--------- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index eb591eaad1..389d1c382e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -178,7 +178,8 @@ extern int vm_swappiness; #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p); extern int putback_lru_pages(struct list_head *l); -extern int migrate_pages(struct list_head *l, struct list_head *t); +extern int migrate_pages(struct list_head *l, struct list_head *t, + struct list_head *moved, struct list_head *failed); #endif #ifdef CONFIG_MMU diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 20d5ad39fa..30bdafba52 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -429,6 +429,19 @@ static int contextualize_policy(int mode, nodemask_t *nodes) return mpol_check_policy(mode, nodes); } +static int swap_pages(struct list_head *pagelist) +{ + LIST_HEAD(moved); + LIST_HEAD(failed); + int n; + + n = migrate_pages(pagelist, NULL, &moved, &failed); + putback_lru_pages(&failed); + putback_lru_pages(&moved); + + return n; +} + long do_mbind(unsigned long start, unsigned long len, unsigned long mode, nodemask_t *nmask, unsigned long flags) { @@ -481,10 +494,13 @@ long do_mbind(unsigned long start, unsigned long len, (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL); err = PTR_ERR(vma); if (!IS_ERR(vma)) { + int nr_failed = 0; + err = mbind_range(vma, start, end, new); if (!list_empty(&pagelist)) - migrate_pages(&pagelist, NULL); - if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT)) + nr_failed = swap_pages(&pagelist); + + if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } if (!list_empty(&pagelist)) @@ -635,11 +651,12 @@ int do_migrate_pages(struct mm_struct *mm, down_read(&mm->mmap_sem); check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, flags | MPOL_MF_DISCONTIG_OK, &pagelist); + if (!list_empty(&pagelist)) { - migrate_pages(&pagelist, NULL); - if (!list_empty(&pagelist)) - count = putback_lru_pages(&pagelist); + count = swap_pages(&pagelist); + putback_lru_pages(&pagelist); } + up_read(&mm->mmap_sem); return count; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 73ba4046ed..5eecb514cc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -670,10 +670,10 @@ retry: * list. The direct migration patchset * extends this function to avoid the use of swap. */ -int migrate_pages(struct list_head *l, struct list_head *t) +int migrate_pages(struct list_head *from, struct list_head *to, + struct list_head *moved, struct list_head *failed) { int retry; - LIST_HEAD(failed); int nr_failed = 0; int pass = 0; struct page *page; @@ -686,12 +686,12 @@ int migrate_pages(struct list_head *l, struct list_head *t) redo: retry = 0; - list_for_each_entry_safe(page, page2, l, lru) { + list_for_each_entry_safe(page, page2, from, lru) { cond_resched(); if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ - move_to_lru(page); + list_move(&page->lru, moved); continue; } /* @@ -722,7 +722,7 @@ redo: if (PageAnon(page) && !PageSwapCache(page)) { if (!add_to_swap(page, GFP_KERNEL)) { unlock_page(page); - list_move(&page->lru, &failed); + list_move(&page->lru, failed); nr_failed++; continue; } @@ -732,8 +732,10 @@ redo: * Page is properly locked and writeback is complete. * Try to migrate the page. 
*/ - if (!swap_page(page)) + if (!swap_page(page)) { + list_move(&page->lru, moved); continue; + } retry_later: retry++; } @@ -743,9 +745,6 @@ retry_later: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; - if (!list_empty(&failed)) - list_splice(&failed, l); - return nr_failed + retry; } -- cgit v1.2.2 From d0d963281ccb22e6f339bfdd75c6b2e31351929f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:55 -0800 Subject: [PATCH] SwapMig: Switch error handling in migrate_pages to use -Exx Use -Exxx instead of numeric return codes and cleanup the code in migrate_pages() using -Exx error codes. Consolidate successful migration handling Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 56 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5eecb514cc..bf903b2d19 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -606,10 +606,6 @@ int putback_lru_pages(struct list_head *l) /* * swapout a single page * page is locked upon entry, unlocked on exit - * - * return codes: - * 0 = complete - * 1 = retry */ static int swap_page(struct page *page) { @@ -650,7 +646,7 @@ unlock_retry: unlock_page(page); retry: - return 1; + return -EAGAIN; } /* * migrate_pages @@ -669,6 +665,8 @@ retry: * is only swapping out pages and never touches the second * list. The direct migration patchset * extends this function to avoid the use of swap. + * + * Return: Number of pages not migrated when "to" ran empty. */ int migrate_pages(struct list_head *from, struct list_head *to, struct list_head *moved, struct list_head *failed) @@ -679,6 +677,7 @@ int migrate_pages(struct list_head *from, struct list_head *to, struct page *page; struct page *page2; int swapwrite = current->flags & PF_SWAPWRITE; + int rc; if (!swapwrite) current->flags |= PF_SWAPWRITE; @@ -689,22 +688,23 @@ redo: list_for_each_entry_safe(page, page2, from, lru) { cond_resched(); - if (page_count(page) == 1) { + rc = 0; + if (page_count(page) == 1) /* page was freed from under us. So we are done. */ - list_move(&page->lru, moved); - continue; - } + goto next; + /* * Skip locked pages during the first two passes to give the * functions holding the lock time to release the page. Later we * use lock_page() to have a higher chance of acquiring the * lock. */ + rc = -EAGAIN; if (pass > 2) lock_page(page); else if (TestSetPageLocked(page)) - goto retry_later; + goto next; /* * Only wait on writeback if we have already done a pass where @@ -713,18 +713,19 @@ redo: if (pass > 0) { wait_on_page_writeback(page); } else { - if (PageWriteback(page)) { - unlock_page(page); - goto retry_later; - } + if (PageWriteback(page)) + goto unlock_page; } + /* + * Anonymous pages must have swap cache references otherwise + * the information contained in the page maps cannot be + * preserved. + */ if (PageAnon(page) && !PageSwapCache(page)) { if (!add_to_swap(page, GFP_KERNEL)) { - unlock_page(page); - list_move(&page->lru, failed); - nr_failed++; - continue; + rc = -ENOMEM; + goto unlock_page; } } @@ -732,12 +733,23 @@ redo: * Page is properly locked and writeback is complete. * Try to migrate the page. 
*/ - if (!swap_page(page)) { + rc = swap_page(page); + goto next; + +unlock_page: + unlock_page(page); + +next: + if (rc == -EAGAIN) { + retry++; + } else if (rc) { + /* Permanent failure */ + list_move(&page->lru, failed); + nr_failed++; + } else { + /* Success */ list_move(&page->lru, moved); - continue; } -retry_later: - retry++; } if (retry && pass++ < 10) goto redo; -- cgit v1.2.2 From 45b07ef31d1182d2cfde7711327e3afb268bb1ac Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:00:56 -0800 Subject: [PATCH] cpusets: swap migration interface Add a boolean "memory_migrate" to each cpuset, represented by a file containing "0" or "1" in each directory below /dev/cpuset. It defaults to false (file contains "0"). It can be set true by writing "1" to the file. If true, then anytime that a task is attached to the cpuset so marked, the pages of that task will be moved to that cpuset, preserving, to the extent practical, the cpuset-relative placement of the pages. Also anytime that a cpuset so marked has its memory placement changed (by writing to its "mems" file), the tasks in that cpuset will have their pages moved to the cpusets new nodes, preserving, to the extent practical, the cpuset-relative placement of the moved pages. Signed-off-by: Paul Jackson Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cpusets.txt | 25 +++++++++++++++++++++++++ include/linux/mempolicy.h | 7 +++++++ kernel/cpuset.c | 38 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 2 deletions(-) diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index a09a8eb806..e2d9afc30d 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt @@ -192,6 +192,7 @@ containing the following files describing that cpuset: - cpus: list of CPUs in that cpuset - mems: list of Memory Nodes in that cpuset + - memory_migrate flag: if set, move pages to cpusets nodes - cpu_exclusive flag: is cpu placement exclusive? - mem_exclusive flag: is memory placement exclusive? - tasks: list of tasks (by pid) attached to that cpuset @@ -277,6 +278,30 @@ rewritten to the 'tasks' file of its cpuset. This is done to avoid impacting the scheduler code in the kernel with a check for changes in a tasks processor placement. +Normally, once a page is allocated (given a physical page +of main memory) then that page stays on whatever node it +was allocated, so long as it remains allocated, even if the +cpusets memory placement policy 'mems' subsequently changes. +If the cpuset flag file 'memory_migrate' is set true, then when +tasks are attached to that cpuset, any pages that task had +allocated to it on nodes in its previous cpuset are migrated +to the tasks new cpuset. Depending on the implementation, +this migration may either be done by swapping the page out, +so that the next time the page is referenced, it will be paged +into the tasks new cpuset, usually on the node where it was +referenced, or this migration may be done by directly copying +the pages from the tasks previous cpuset to the new cpuset, +where possible to the same node, relative to the new cpuset, +as the node that held the page, relative to the old cpuset. +Also if 'memory_migrate' is set true, then if that cpusets +'mems' file is modified, pages allocated to tasks in that +cpuset, that were on nodes in the previous setting of 'mems', +will be moved to nodes in the new setting of 'mems.' 
Again, +depending on the implementation, this might be done by swapping, +or by direct copying. In either case, pages that were not in +the tasks prior cpuset, or in the cpusets prior 'mems' setting, +will not be moved. + There is an exception to the above. If hotplug functionality is used to remove all the CPUs that are currently assigned to a cpuset, then the kernel will automatically update the cpus_allowed of all diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 3e61e82968..66247eff24 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -235,6 +235,13 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER); } +static inline int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, + const nodemask_t *to_nodes, int flags) +{ + return 0; +} + static inline void check_highest_zone(int k) { } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7430640f98..f63383e01e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -87,6 +87,7 @@ struct cpuset { typedef enum { CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, + CS_MEMORY_MIGRATE, CS_REMOVED, CS_NOTIFY_ON_RELEASE } cpuset_flagbits_t; @@ -112,6 +113,11 @@ static inline int notify_on_release(const struct cpuset *cs) return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); } +static inline int is_memory_migrate(const struct cpuset *cs) +{ + return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); +} + /* * Increment this atomic integer everytime any cpuset changes its * mems_allowed value. Users of cpusets can track this generation @@ -602,16 +608,24 @@ static void refresh_mems(void) if (current->cpuset_mems_generation != my_cpusets_mem_gen) { struct cpuset *cs; nodemask_t oldmem = current->mems_allowed; + int migrate; down(&callback_sem); task_lock(current); cs = current->cpuset; + migrate = is_memory_migrate(cs); guarantee_online_mems(cs, ¤t->mems_allowed); current->cpuset_mems_generation = cs->mems_generation; task_unlock(current); up(&callback_sem); - if (!nodes_equal(oldmem, current->mems_allowed)) + if (!nodes_equal(oldmem, current->mems_allowed)) { numa_policy_rebind(&oldmem, ¤t->mems_allowed); + if (migrate) { + do_migrate_pages(current->mm, &oldmem, + ¤t->mems_allowed, + MPOL_MF_MOVE_ALL); + } + } } } @@ -795,7 +809,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, - * CS_NOTIFY_ON_RELEASE) + * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 * @@ -848,6 +862,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) struct task_struct *tsk; struct cpuset *oldcs; cpumask_t cpus; + nodemask_t from, to; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -893,7 +908,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) guarantee_online_cpus(cs, &cpus); set_cpus_allowed(tsk, cpus); + from = oldcs->mems_allowed; + to = cs->mems_allowed; + up(&callback_sem); + if (is_memory_migrate(cs)) + do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); put_task_struct(tsk); if (atomic_dec_and_test(&oldcs->count)) check_for_release(oldcs, ppathbuf); @@ -905,6 +925,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) typedef enum { FILE_ROOT, FILE_DIR, + FILE_MEMORY_MIGRATE, FILE_CPULIST, FILE_MEMLIST, FILE_CPU_EXCLUSIVE, @@ -960,6 +981,9 @@ static ssize_t 
cpuset_common_file_write(struct file *file, const char __user *us case FILE_NOTIFY_ON_RELEASE: retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); break; + case FILE_MEMORY_MIGRATE: + retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); + break; case FILE_TASKLIST: retval = attach_task(cs, buffer, &pathbuf); break; @@ -1060,6 +1084,9 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, case FILE_NOTIFY_ON_RELEASE: *s++ = notify_on_release(cs) ? '1' : '0'; break; + case FILE_MEMORY_MIGRATE: + *s++ = is_memory_migrate(cs) ? '1' : '0'; + break; default: retval = -EINVAL; goto out; @@ -1408,6 +1435,11 @@ static struct cftype cft_notify_on_release = { .private = FILE_NOTIFY_ON_RELEASE, }; +static struct cftype cft_memory_migrate = { + .name = "memory_migrate", + .private = FILE_MEMORY_MIGRATE, +}; + static int cpuset_populate_dir(struct dentry *cs_dentry) { int err; @@ -1422,6 +1454,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) return err; if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) return err; + if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) + return err; if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) return err; return 0; -- cgit v1.2.2 From aea47ff363c15b0be5fc27ed991b1fdee338f0a7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:57 -0800 Subject: [PATCH] mm: make hugepages obey cpusets. See http://marc.theaimsgroup.com/?l=linux-kernel&m=113167000201265&w=2 http://marc.theaimsgroup.com/?l=linux-mm&m=113167267527312&w=2 Make hugepages obey cpusets. Signed-off-by: Christoph Lameter Acked-by: William Irwin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f4c43d7980..b21d78c941 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -48,7 +49,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, for (z = zonelist->zones; *z; z++) { nid = (*z)->zone_pgdat->node_id; - if (!list_empty(&hugepage_freelists[nid])) + if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && + !list_empty(&hugepage_freelists[nid])) break; } -- cgit v1.2.2 From 152194aaa6266d71dfee57882a23def339ef17a4 Mon Sep 17 00:00:00 2001 From: Avishay Traeger Date: Sun, 8 Jan 2006 01:00:58 -0800 Subject: [PATCH] set_page_count() macro safety Fix set_page_count() macro to handle complex arguments. Signed-off-by: Avishay Traeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 83c651f251..7ff54242c5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -308,7 +308,7 @@ struct page { */ #define get_page_testone(p) atomic_inc_and_test(&(p)->_count) -#define set_page_count(p,v) atomic_set(&(p)->_count, v - 1) +#define set_page_count(p,v) atomic_set(&(p)->_count, (v) - 1) #define __put_page(p) atomic_dec(&(p)->_count) extern void FASTCALL(__page_cache_release(struct page *)); -- cgit v1.2.2 From cd105df4590c89837a1c300843238148cfef9b5f Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Sun, 8 Jan 2006 01:00:59 -0800 Subject: [PATCH] mm: clean up local variables Clean up a local variable with the same name as a variable in a larger block. Also move a variable into the block where it's actually used. 
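As a generic illustration of the shadowing problem (a sketch only, not code from this patch), an inner declaration with the same name silently hides the outer variable, which makes it easy to misread which one a given statement touches:

	/* Illustrative sketch only (not from this patch): the inner 'n' hides
	 * the outer 'n', so the return value is easy to misread. */
	static int shadow_example(void)
	{
		int n = 0;		/* outer variable */
		int i;

		for (i = 0; i < 4; i++) {
			int n = i * 2;	/* shadows the outer 'n' */
			(void)n;	/* only the inner 'n' is visible here */
		}
		return n;		/* still 0: the loop never touched the outer 'n' */
	}
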
Spotted by http://linuxicc.sourceforge.net/ Signed-off-by: Tobias Klauser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 3 ++- mm/swapfile.c | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index f71d8be2f4..76b092bd0b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -854,7 +854,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); int memsize = sizeof(struct kmem_list3); - struct array_cache *nc = NULL; switch (action) { case CPU_UP_PREPARE: @@ -891,6 +890,8 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, /* Now we can go ahead with allocating the shared array's & array cache's */ list_for_each_entry(cachep, &cache_chain, next) { + struct array_cache *nc; + nc = alloc_arraycache(node, cachep->limit, cachep->batchcount); if (!nc) diff --git a/mm/swapfile.c b/mm/swapfile.c index 6da4b28b89..80f948a202 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1493,7 +1493,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) goto bad_swap; - + /* OK, set up the swap map and apply the bad block list */ if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { error = -ENOMEM; @@ -1502,17 +1502,17 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) error = 0; memset(p->swap_map, 0, maxpages * sizeof(short)); - for (i=0; iinfo.nr_badpages; i++) { - int page = swap_header->info.badpages[i]; - if (page <= 0 || page >= swap_header->info.last_page) + for (i = 0; i < swap_header->info.nr_badpages; i++) { + int page_nr = swap_header->info.badpages[i]; + if (page_nr <= 0 || page_nr >= swap_header->info.last_page) error = -EINVAL; else - p->swap_map[page] = SWAP_MAP_BAD; + p->swap_map[page_nr] = SWAP_MAP_BAD; } nr_good_pages = swap_header->info.last_page - swap_header->info.nr_badpages - 1 /* header page */; - if (error) + if (error) goto bad_swap; } -- cgit v1.2.2 From ef2bf0dc8675e14cf8cba3b7fb9f48d72640a70e Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Sun, 8 Jan 2006 01:01:00 -0800 Subject: [PATCH] rmap: additional diagnostics in page_remove_rmap() We seem to be hitting this assertion failure too often for it to be hardware bugs. Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/rmap.c b/mm/rmap.c index 6f3f7db271..66ec43053a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -514,6 +514,13 @@ void page_add_file_rmap(struct page *page) void page_remove_rmap(struct page *page) { if (atomic_add_negative(-1, &page->_mapcount)) { + if (page_mapcount(page) < 0) { + printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! 
(%d)\n", page_mapcount(page)); + printk (KERN_EMERG " page->flags = %lx\n", page->flags); + printk (KERN_EMERG " page->count = %x\n", page_count(page)); + printk (KERN_EMERG " page->mapping = %p\n", page->mapping); + } + BUG_ON(page_mapcount(page) < 0); /* * It would be tidy to reset the PageAnon mapping here, -- cgit v1.2.2 From 38e35860dbe6197a4b42eb6e8b47da940b7695dd Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:01 -0800 Subject: [PATCH] mempolicies: private pointer in check_range and MPOL_MF_INVERT This was was first posted at http://marc.theaimsgroup.com/?l=linux-mm&m=113149240227584&w=2 (Part of this functionality is also contained in the direct migration pathset. The functionality here is more generic and independent of that patchset.) - Add internal flags MPOL_MF_INVERT to control check_range() behavior. - Replace the pagelist passed through by check_range by a general private pointer that may be used for other purposes. (The following patches will use that to merge numa_maps into mempolicy.c and to better group the page migration code in the policy layer) - Improve some comments. Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 30bdafba52..270e9a39ec 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -88,8 +88,9 @@ #include #include -/* Internal MPOL_MF_xxx flags */ +/* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ +#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ static kmem_cache_t *policy_cache; static kmem_cache_t *sn_cache; @@ -227,11 +228,11 @@ static void migrate_page_add(struct vm_area_struct *vma, } } -/* Ensure all existing pages follow the policy. */ +/* Scan through pages checking if pages follow certain conditions. 
*/ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pte_t *orig_pte; pte_t *pte; @@ -248,12 +249,13 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!page) continue; nid = page_to_nid(page); - if (!node_isset(nid, *nodes)) { - if (pagelist) - migrate_page_add(vma, page, pagelist, flags); - else - break; - } + if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + continue; + + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_page_add(vma, page, private, flags); + else + break; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); return addr != end; @@ -262,7 +264,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pmd_t *pmd; unsigned long next; @@ -273,7 +275,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, if (pmd_none_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, - flags, pagelist)) + flags, private)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; @@ -282,7 +284,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pud_t *pud; unsigned long next; @@ -293,7 +295,7 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, if (pud_none_or_clear_bad(pud)) continue; if (check_pmd_range(vma, pud, addr, next, nodes, - flags, pagelist)) + flags, private)) return -EIO; } while (pud++, addr = next, addr != end); return 0; @@ -302,7 +304,7 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, static inline int check_pgd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pgd_t *pgd; unsigned long next; @@ -313,7 +315,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma, if (pgd_none_or_clear_bad(pgd)) continue; if (check_pud_range(vma, pgd, addr, next, nodes, - flags, pagelist)) + flags, private)) return -EIO; } while (pgd++, addr = next, addr != end); return 0; @@ -335,8 +337,7 @@ static inline int vma_migratable(struct vm_area_struct *vma) */ static struct vm_area_struct * check_range(struct mm_struct *mm, unsigned long start, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + const nodemask_t *nodes, unsigned long flags, void *private) { int err; struct vm_area_struct *first, *vma, *prev; @@ -363,7 +364,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, if (vma->vm_start > start) start = vma->vm_start; err = check_pgd_range(vma, start, endvma, nodes, - flags, pagelist); + flags, private); if (err) { first = ERR_PTR(err); break; @@ -452,7 +453,8 @@ long do_mbind(unsigned long start, unsigned long len, int err; LIST_HEAD(pagelist); - if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + if ((flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || mode > 
MPOL_MAX) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) @@ -490,8 +492,9 @@ long do_mbind(unsigned long start, unsigned long len, mode,nodes_addr(nodes)[0]); down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nmask, flags, - (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL); + vma = check_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT, &pagelist); + err = PTR_ERR(vma); if (!IS_ERR(vma)) { int nr_failed = 0; @@ -646,7 +649,6 @@ int do_migrate_pages(struct mm_struct *mm, nodemask_t nodes; nodes_andnot(nodes, *from_nodes, *to_nodes); - nodes_complement(nodes, nodes); down_read(&mm->mmap_sem); check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, -- cgit v1.2.2 From 1a75a6c825c17249ca49f050a872a04ce0997ce3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:02 -0800 Subject: [PATCH] Fold numa_maps into mempolicies.c First discussed at http://marc.theaimsgroup.com/?t=113149255100001&r=1&w=2 - Use the check_range() in mempolicy.c to gather statistics. - Improve the numa_maps code in general and fix some comments. Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 127 ++---------------------------------------------- mm/mempolicy.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 142 insertions(+), 123 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 50bd5a8f04..0eaad41f46 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -390,129 +390,12 @@ struct seq_operations proc_pid_smaps_op = { }; #ifdef CONFIG_NUMA - -struct numa_maps { - unsigned long pages; - unsigned long anon; - unsigned long mapped; - unsigned long mapcount_max; - unsigned long node[MAX_NUMNODES]; -}; - -/* - * Calculate numa node maps for a vma - */ -static struct numa_maps *get_numa_maps(struct vm_area_struct *vma) -{ - int i; - struct page *page; - unsigned long vaddr; - struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL); - - if (!md) - return NULL; - md->pages = 0; - md->anon = 0; - md->mapped = 0; - md->mapcount_max = 0; - for_each_node(i) - md->node[i] =0; - - for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { - page = follow_page(vma, vaddr, 0); - if (page) { - int count = page_mapcount(page); - - if (count) - md->mapped++; - if (count > md->mapcount_max) - md->mapcount_max = count; - md->pages++; - if (PageAnon(page)) - md->anon++; - md->node[page_to_nid(page)]++; - } - cond_resched(); - } - return md; -} - -static int show_numa_map(struct seq_file *m, void *v) -{ - struct task_struct *task = m->private; - struct vm_area_struct *vma = v; - struct mempolicy *pol; - struct numa_maps *md; - struct zone **z; - int n; - int first; - - if (!vma->vm_mm) - return 0; - - md = get_numa_maps(vma); - if (!md) - return 0; - - seq_printf(m, "%08lx", vma->vm_start); - pol = get_vma_policy(task, vma, vma->vm_start); - /* Print policy */ - switch (pol->policy) { - case MPOL_PREFERRED: - seq_printf(m, " prefer=%d", pol->v.preferred_node); - break; - case MPOL_BIND: - seq_printf(m, " bind={"); - first = 1; - for (z = pol->v.zonelist->zones; *z; z++) { - - if (!first) - seq_putc(m, ','); - else - first = 0; - seq_printf(m, "%d/%s", (*z)->zone_pgdat->node_id, - (*z)->name); - } - seq_putc(m, '}'); - break; - case MPOL_INTERLEAVE: - seq_printf(m, " interleave={"); - first = 1; - for_each_node(n) { - if (node_isset(n, pol->v.nodes)) { - if (!first) - seq_putc(m,','); - else - first = 
0; - seq_printf(m, "%d",n); - } - } - seq_putc(m, '}'); - break; - default: - seq_printf(m," default"); - break; - } - seq_printf(m, " MaxRef=%lu Pages=%lu Mapped=%lu", - md->mapcount_max, md->pages, md->mapped); - if (md->anon) - seq_printf(m," Anon=%lu",md->anon); - - for_each_online_node(n) { - if (md->node[n]) - seq_printf(m, " N%d=%lu", n, md->node[n]); - } - seq_putc(m, '\n'); - kfree(md); - if (m->count < m->size) /* vma is copied successfully */ - m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; - return 0; -} +extern int show_numa_map(struct seq_file *m, void *v); struct seq_operations proc_pid_numa_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_numa_map + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_numa_map }; #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 270e9a39ec..44b9d69900 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -84,6 +84,8 @@ #include #include #include +#include +#include #include #include @@ -91,6 +93,7 @@ /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ static kmem_cache_t *policy_cache; static kmem_cache_t *sn_cache; @@ -228,6 +231,8 @@ static void migrate_page_add(struct vm_area_struct *vma, } } +static void gather_stats(struct page *, void *); + /* Scan through pages checking if pages follow certain conditions. */ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -252,7 +257,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) continue; - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & MPOL_MF_STATS) + gather_stats(page, private); + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) migrate_page_add(vma, page, private, flags); else break; @@ -1460,3 +1467,132 @@ void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) { rebind_policy(current->mempolicy, old, new); } + +/* + * Display pages allocated per node and memory policy via /proc. + */ + +static const char *policy_types[] = { "default", "prefer", "bind", + "interleave" }; + +/* + * Convert a mempolicy into a string. + * Returns the number of characters in buffer (if positive) + * or an error (negative) + */ +static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +{ + char *p = buffer; + int l; + nodemask_t nodes; + int mode = pol ? 
pol->policy : MPOL_DEFAULT; + + switch (mode) { + case MPOL_DEFAULT: + nodes_clear(nodes); + break; + + case MPOL_PREFERRED: + nodes_clear(nodes); + node_set(pol->v.preferred_node, nodes); + break; + + case MPOL_BIND: + get_zonemask(pol, &nodes); + break; + + case MPOL_INTERLEAVE: + nodes = pol->v.nodes; + break; + + default: + BUG(); + return -EFAULT; + } + + l = strlen(policy_types[mode]); + if (buffer + maxlen < p + l + 1) + return -ENOSPC; + + strcpy(p, policy_types[mode]); + p += l; + + if (!nodes_empty(nodes)) { + if (buffer + maxlen < p + 2) + return -ENOSPC; + *p++ = '='; + p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); + } + return p - buffer; +} + +struct numa_maps { + unsigned long pages; + unsigned long anon; + unsigned long mapped; + unsigned long mapcount_max; + unsigned long node[MAX_NUMNODES]; +}; + +static void gather_stats(struct page *page, void *private) +{ + struct numa_maps *md = private; + int count = page_mapcount(page); + + if (count) + md->mapped++; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->pages++; + + if (PageAnon(page)) + md->anon++; + + md->node[page_to_nid(page)]++; + cond_resched(); +} + +int show_numa_map(struct seq_file *m, void *v) +{ + struct task_struct *task = m->private; + struct vm_area_struct *vma = v; + struct numa_maps *md; + int n; + char buffer[50]; + + if (!vma->vm_mm) + return 0; + + md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); + if (!md) + return 0; + + check_pgd_range(vma, vma->vm_start, vma->vm_end, + &node_online_map, MPOL_MF_STATS, md); + + if (md->pages) { + mpol_to_str(buffer, sizeof(buffer), + get_vma_policy(task, vma, vma->vm_start)); + + seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu", + vma->vm_start, buffer, md->pages, + md->mapped, md->mapcount_max); + + if (md->anon) + seq_printf(m," anon=%lu",md->anon); + + for_each_online_node(n) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); + + seq_putc(m, '\n'); + } + kfree(md); + + if (m->count < m->size) + m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; + return 0; +} + -- cgit v1.2.2 From 132beacf9785d2e6e8aecb59aa078f3ca5668fa6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:02 -0800 Subject: [PATCH] Drop page table lock before calling migrate_page_add() migrate_page_add cannot be called with a spinlock held (calls isolate_lru_page which calles schedule_on_each_cpu). Drop ptl lock in check_pte_range before calling migrate_page_add(). Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 44b9d69900..4c0510e9e7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -259,8 +259,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (flags & MPOL_MF_STATS) gather_stats(page, private); - else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + spin_unlock(ptl); migrate_page_add(vma, page, private, flags); + spin_lock(ptl); + } else break; } while (pte++, addr += PAGE_SIZE, addr != end); -- cgit v1.2.2 From 48fce3429df84a94766fbbc845fa8450d0715b48 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:03 -0800 Subject: [PATCH] mempolicies: unexport get_vma_policy() Since the numa_maps functionality is now in mempolicy.c we no longer need to export get_vma_policy(). 
Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 3 --- mm/mempolicy.c | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 66247eff24..05fddd5bee 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -144,9 +144,6 @@ void mpol_free_shared_policy(struct shared_policy *p); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx); -struct mempolicy *get_vma_policy(struct task_struct *task, - struct vm_area_struct *vma, unsigned long addr); - extern void numa_default_policy(void); extern void numa_policy_init(void); extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4c0510e9e7..4b077ec6c0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -935,8 +935,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, #endif /* Return effective policy for a VMA */ -struct mempolicy * -get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) +static struct mempolicy * get_vma_policy(struct task_struct *task, + struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; -- cgit v1.2.2 From 6ce3c4c0ff62ca6391019b7832fb41a7f28b9e26 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:04 -0800 Subject: [PATCH] Move page migration related functions near do_migrate_pages() Group page migration functions in mempolicy.c Add a forward declaration for migrate_page_add (like gather_stats()) and use our new found mobility to group all page migration related function around do_migrate_pages(). Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 270 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 138 insertions(+), 132 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4b077ec6c0..7051fe450e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -183,55 +183,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) return policy; } -/* Check if we are the only process mapping the page in question */ -static inline int single_mm_mapping(struct mm_struct *mm, - struct address_space *mapping) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - int rc = 1; - - spin_lock(&mapping->i_mmap_lock); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } -out: - spin_unlock(&mapping->i_mmap_lock); - return rc; -} - -/* - * Add a page to be migrated to the pagelist - */ -static void migrate_page_add(struct vm_area_struct *vma, - struct page *page, struct list_head *pagelist, unsigned long flags) -{ - /* - * Avoid migrating a page that is shared by others and not writable. - */ - if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || - mapping_writably_mapped(page->mapping) || - single_mm_mapping(vma->vm_mm, page->mapping)) { - int rc = isolate_lru_page(page); - - if (rc == 1) - list_add(&page->lru, pagelist); - /* - * If the isolate attempt was not successful then we just - * encountered an unswappable page. Something must be wrong. 
- */ - WARN_ON(rc == 0); - } -} - static void gather_stats(struct page *, void *); +static void migrate_page_add(struct vm_area_struct *vma, + struct page *page, struct list_head *pagelist, unsigned long flags); /* Scan through pages checking if pages follow certain conditions. */ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, @@ -440,90 +394,6 @@ static int contextualize_policy(int mode, nodemask_t *nodes) return mpol_check_policy(mode, nodes); } -static int swap_pages(struct list_head *pagelist) -{ - LIST_HEAD(moved); - LIST_HEAD(failed); - int n; - - n = migrate_pages(pagelist, NULL, &moved, &failed); - putback_lru_pages(&failed); - putback_lru_pages(&moved); - - return n; -} - -long do_mbind(unsigned long start, unsigned long len, - unsigned long mode, nodemask_t *nmask, unsigned long flags) -{ - struct vm_area_struct *vma; - struct mm_struct *mm = current->mm; - struct mempolicy *new; - unsigned long end; - int err; - LIST_HEAD(pagelist); - - if ((flags & ~(unsigned long)(MPOL_MF_STRICT | - MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - || mode > MPOL_MAX) - return -EINVAL; - if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - if (start & ~PAGE_MASK) - return -EINVAL; - - if (mode == MPOL_DEFAULT) - flags &= ~MPOL_MF_STRICT; - - len = (len + PAGE_SIZE - 1) & PAGE_MASK; - end = start + len; - - if (end < start) - return -EINVAL; - if (end == start) - return 0; - - if (mpol_check_policy(mode, nmask)) - return -EINVAL; - - new = mpol_new(mode, nmask); - if (IS_ERR(new)) - return PTR_ERR(new); - - /* - * If we are using the default policy then operation - * on discontinuous address spaces is okay after all - */ - if (!new) - flags |= MPOL_MF_DISCONTIG_OK; - - PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, - mode,nodes_addr(nodes)[0]); - - down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nmask, - flags | MPOL_MF_INVERT, &pagelist); - - err = PTR_ERR(vma); - if (!IS_ERR(vma)) { - int nr_failed = 0; - - err = mbind_range(vma, start, end, new); - if (!list_empty(&pagelist)) - nr_failed = swap_pages(&pagelist); - - if (!err && nr_failed && (flags & MPOL_MF_STRICT)) - err = -EIO; - } - if (!list_empty(&pagelist)) - putback_lru_pages(&pagelist); - - up_write(&mm->mmap_sem); - mpol_free(new); - return err; -} - /* Set the process memory policy */ long do_set_mempolicy(int mode, nodemask_t *nodes) { @@ -643,6 +513,71 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +/* + * page migration + */ + +/* Check if we are the only process mapping the page in question */ +static inline int single_mm_mapping(struct mm_struct *mm, + struct address_space *mapping) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int rc = 1; + + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) + if (mm != vma->vm_mm) { + rc = 0; + goto out; + } + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + if (mm != vma->vm_mm) { + rc = 0; + goto out; + } +out: + spin_unlock(&mapping->i_mmap_lock); + return rc; +} + +/* + * Add a page to be migrated to the pagelist + */ +static void migrate_page_add(struct vm_area_struct *vma, + struct page *page, struct list_head *pagelist, unsigned long flags) +{ + /* + * Avoid migrating a page that is shared by others and not writable. 
+ */ + if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || + mapping_writably_mapped(page->mapping) || + single_mm_mapping(vma->vm_mm, page->mapping)) { + int rc = isolate_lru_page(page); + + if (rc == 1) + list_add(&page->lru, pagelist); + /* + * If the isolate attempt was not successful then we just + * encountered an unswappable page. Something must be wrong. + */ + WARN_ON(rc == 0); + } +} + +static int swap_pages(struct list_head *pagelist) +{ + LIST_HEAD(moved); + LIST_HEAD(failed); + int n; + + n = migrate_pages(pagelist, NULL, &moved, &failed); + putback_lru_pages(&failed); + putback_lru_pages(&moved); + + return n; +} + /* * For now migrate_pages simply swaps out the pages from nodes that are in * the source set but not in the target set. In the future, we would @@ -673,6 +608,77 @@ int do_migrate_pages(struct mm_struct *mm, return count; } +long do_mbind(unsigned long start, unsigned long len, + unsigned long mode, nodemask_t *nmask, unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct mempolicy *new; + unsigned long end; + int err; + LIST_HEAD(pagelist); + + if ((flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + || mode > MPOL_MAX) + return -EINVAL; + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (start & ~PAGE_MASK) + return -EINVAL; + + if (mode == MPOL_DEFAULT) + flags &= ~MPOL_MF_STRICT; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + + if (mpol_check_policy(mode, nmask)) + return -EINVAL; + + new = mpol_new(mode, nmask); + if (IS_ERR(new)) + return PTR_ERR(new); + + /* + * If we are using the default policy then operation + * on discontinuous address spaces is okay after all + */ + if (!new) + flags |= MPOL_MF_DISCONTIG_OK; + + PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, + mode,nodes_addr(nodes)[0]); + + down_write(&mm->mmap_sem); + vma = check_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT, &pagelist); + + err = PTR_ERR(vma); + if (!IS_ERR(vma)) { + int nr_failed = 0; + + err = mbind_range(vma, start, end, new); + if (!list_empty(&pagelist)) + nr_failed = swap_pages(&pagelist); + + if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; + } + if (!list_empty(&pagelist)) + putback_lru_pages(&pagelist); + + up_write(&mm->mmap_sem); + mpol_free(new); + return err; +} + /* * User space interface with variable sized bitmaps for nodelists. */ -- cgit v1.2.2 From 2f659f462d2ab519068d0e2bb677d7a700decb8d Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Sun, 8 Jan 2006 01:01:05 -0800 Subject: [PATCH] Optimise oom kill of current task When oom_killer kills current there's no need to call schedule_timeout_interruptible() since task must die ASAP. Signed-Off-By: Pavel Emelianov Signed-Off-By: Kirill Korotaev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d348b90359..4748b906af 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -298,7 +298,8 @@ retry: /* * Give "p" a good chance of killing itself before we - * retry to allocate memory. 
+ * retry to allocate memory unless "p" is current */ - schedule_timeout_interruptible(1); + if (!test_thread_flag(TIF_MEMDIE)) + schedule_timeout_interruptible(1); } -- cgit v1.2.2 From 974dffc2dd72d499a3ff150f61c8025b097a2c34 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Sun, 8 Jan 2006 01:01:06 -0800 Subject: [PATCH] cs89x0: make {read,write}word take base_addr readword() and writeword() take a 'struct net_device *' and deref its ->base_addr member. Make them take the base_addr directly instead, so that we can switch the other occurences of inw/outw in the file over to readword/writeword as well. Signed-off-by: Lennert Buytenhek Cc: dmitry pervushin Cc: Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/cs89x0.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/cs89x0.c b/drivers/net/cs89x0.c index a6078ad9b6..29a7c0cf84 100644 --- a/drivers/net/cs89x0.c +++ b/drivers/net/cs89x0.c @@ -353,15 +353,15 @@ writereg(struct net_device *dev, int portno, int value) } static int -readword(struct net_device *dev, int portno) +readword(unsigned long base_addr, int portno) { - return inw(dev->base_addr + portno); + return inw(base_addr + portno); } static void -writeword(struct net_device *dev, int portno, int value) +writeword(unsigned long base_addr, int portno, int value) { - outw(value, dev->base_addr + portno); + outw(value, base_addr + portno); } static int __init @@ -1104,8 +1104,8 @@ send_test_pkt(struct net_device *dev) memcpy(test_packet, dev->dev_addr, ETH_ALEN); memcpy(test_packet+ETH_ALEN, dev->dev_addr, ETH_ALEN); - writeword(dev, TX_CMD_PORT, TX_AFTER_ALL); - writeword(dev, TX_LEN_PORT, ETH_ZLEN); + writeword(dev->base_addr, TX_CMD_PORT, TX_AFTER_ALL); + writeword(dev->base_addr, TX_LEN_PORT, ETH_ZLEN); /* Test to see if the chip has allocated memory for the packet */ while (jiffies - timenow < 5) @@ -1457,8 +1457,8 @@ static int net_send_packet(struct sk_buff *skb, struct net_device *dev) netif_stop_queue(dev); /* initiate a transmit sequence */ - writeword(dev, TX_CMD_PORT, lp->send_cmd); - writeword(dev, TX_LEN_PORT, skb->len); + writeword(dev->base_addr, TX_CMD_PORT, lp->send_cmd); + writeword(dev->base_addr, TX_LEN_PORT, skb->len); /* Test to see if the chip has allocated memory for the packet */ if ((readreg(dev, PP_BusST) & READY_FOR_TX_NOW) == 0) { @@ -1512,7 +1512,7 @@ static irqreturn_t net_interrupt(int irq, void *dev_id, struct pt_regs * regs) course, if you're on a slow machine, and packets are arriving faster than you can read them off, you're screwed. Hasta la vista, baby! */ - while ((status = readword(dev, ISQ_PORT))) { + while ((status = readword(dev->base_addr, ISQ_PORT))) { if (net_debug > 4)printk("%s: event=%04x\n", dev->name, status); handled = 1; switch(status & ISQ_EVENT_MASK) { -- cgit v1.2.2 From fc8c7d79b117f7a5b17640bf657d95afe4025bb8 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Sun, 8 Jan 2006 01:01:08 -0800 Subject: [PATCH] cs89x0: convert {inw,outw} calls to {read,write}word Switch all occurences of inw/outw in the driver over to readword/writeword. 
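Funnelling every access through readword()/writeword() pays off on boards where the chip does not sit behind plain ISA port I/O; as a sketch of where the series is heading, these two helpers are copied from the IXDP2x01 variant added by a later patch below, where the registers are word-spaced and memory-mapped:

	/* Copied from the IXDP2x01 case added later in this series: the helper
	 * hides both the doubled register offset and the MMIO access. */
	static int readword(unsigned long base_addr, int portno)
	{
		return (u16)__raw_readl(base_addr + (portno << 1));
	}

	static void writeword(unsigned long base_addr, int portno, int value)
	{
		__raw_writel((u16)value, base_addr + (portno << 1));
	}
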
Signed-off-by: Lennert Buytenhek Cc: dmitry pervushin Cc: Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/cs89x0.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/net/cs89x0.c b/drivers/net/cs89x0.c index 29a7c0cf84..04fca31f17 100644 --- a/drivers/net/cs89x0.c +++ b/drivers/net/cs89x0.c @@ -491,8 +491,8 @@ cs89x0_probe1(struct net_device *dev, int ioaddr, int modular) #ifdef CONFIG_SH_HICOSH4 /* truely reset the chip */ - outw(0x0114, ioaddr + ADD_PORT); - outw(0x0040, ioaddr + DATA_PORT); + writeword(ioaddr, ADD_PORT, 0x0114); + writeword(ioaddr, DATA_PORT, 0x0040); #endif /* if they give us an odd I/O address, then do ONE write to @@ -503,24 +503,24 @@ cs89x0_probe1(struct net_device *dev, int ioaddr, int modular) if (net_debug > 1) printk(KERN_INFO "%s: odd ioaddr 0x%x\n", dev->name, ioaddr); if ((ioaddr & 2) != 2) - if ((inw((ioaddr & ~3)+ ADD_PORT) & ADD_MASK) != ADD_SIG) { + if ((readword(ioaddr & ~3, ADD_PORT) & ADD_MASK) != ADD_SIG) { printk(KERN_ERR "%s: bad signature 0x%x\n", - dev->name, inw((ioaddr & ~3)+ ADD_PORT)); + dev->name, readword(ioaddr & ~3, ADD_PORT)); retval = -ENODEV; goto out2; } } - printk(KERN_DEBUG "PP_addr at %x: 0x%x\n", - ioaddr + ADD_PORT, inw(ioaddr + ADD_PORT)); + printk(KERN_DEBUG "PP_addr at %x[%x]: 0x%x\n", + ioaddr, ADD_PORT, readword(ioaddr, ADD_PORT)); ioaddr &= ~3; - outw(PP_ChipID, ioaddr + ADD_PORT); + writeword(ioaddr, ADD_PORT, PP_ChipID); - tmp = inw(ioaddr + DATA_PORT); + tmp = readword(ioaddr, DATA_PORT); if (tmp != CHIP_EISA_ID_SIG) { - printk(KERN_DEBUG "%s: incorrect signature at %x: 0x%x!=" + printk(KERN_DEBUG "%s: incorrect signature at %x[%x]: 0x%x!=" CHIP_EISA_ID_SIG_STR "\n", - dev->name, ioaddr + DATA_PORT, tmp); + dev->name, ioaddr, DATA_PORT, tmp); retval = -ENODEV; goto out2; } @@ -790,7 +790,7 @@ cs89x0_probe1(struct net_device *dev, int ioaddr, int modular) goto out3; return 0; out3: - outw(PP_ChipID, dev->base_addr + ADD_PORT); + writeword(dev->base_addr, ADD_PORT, PP_ChipID); out2: release_region(ioaddr & ~3, NETCARD_IO_EXTENT); out1: @@ -970,11 +970,11 @@ void __init reset_chip(struct net_device *dev) #ifndef CONFIG_ARCH_IXDP2X01 if (lp->chip_type != CS8900) { /* Hardware problem requires PNP registers to be reconfigured after a reset */ - outw(PP_CS8920_ISAINT, ioaddr + ADD_PORT); + writeword(ioaddr, ADD_PORT, PP_CS8920_ISAINT); outb(dev->irq, ioaddr + DATA_PORT); outb(0, ioaddr + DATA_PORT + 1); - outw(PP_CS8920_ISAMemB, ioaddr + ADD_PORT); + writeword(ioaddr, ADD_PORT, PP_CS8920_ISAMemB); outb((dev->mem_start >> 16) & 0xff, ioaddr + DATA_PORT); outb((dev->mem_start >> 8) & 0xff, ioaddr + DATA_PORT + 1); } @@ -1606,8 +1606,8 @@ net_rx(struct net_device *dev) int status, length; int ioaddr = dev->base_addr; - status = inw(ioaddr + RX_FRAME_PORT); - length = inw(ioaddr + RX_FRAME_PORT); + status = readword(ioaddr, RX_FRAME_PORT); + length = readword(ioaddr, RX_FRAME_PORT); if ((status & RX_OK) == 0) { count_rx_errors(status, lp); @@ -1628,7 +1628,7 @@ net_rx(struct net_device *dev) insw(ioaddr + RX_FRAME_PORT, skb_put(skb, length), length >> 1); if (length & 1) - skb->data[length-1] = inw(ioaddr + RX_FRAME_PORT); + skb->data[length-1] = readword(ioaddr, RX_FRAME_PORT); if (net_debug > 3) { printk( "%s: received %d byte packet of type %x\n", @@ -1901,7 +1901,7 @@ void cleanup_module(void) { unregister_netdev(dev_cs89x0); - outw(PP_ChipID, dev_cs89x0->base_addr + ADD_PORT); + writeword(dev_cs89x0->base_addr, ADD_PORT, 
PP_ChipID); release_region(dev_cs89x0->base_addr, NETCARD_IO_EXTENT); free_netdev(dev_cs89x0); } -- cgit v1.2.2 From 0d5affcfe23ade8c4c01154e7e381e2a9a916399 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Sun, 8 Jan 2006 01:01:09 -0800 Subject: [PATCH] cs89x0: swap {read,write}reg and {read,write}word Reverse the order of readreg/writereg and readword/writeword in the file, so that we can make readreg/writereg use readword/writeword. Signed-off-by: Lennert Buytenhek Cc: dmitry pervushin Cc: Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/cs89x0.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/cs89x0.c b/drivers/net/cs89x0.c index 04fca31f17..221f92e45e 100644 --- a/drivers/net/cs89x0.c +++ b/drivers/net/cs89x0.c @@ -339,29 +339,29 @@ out: #endif static int -readreg(struct net_device *dev, int portno) +readword(unsigned long base_addr, int portno) { - outw(portno, dev->base_addr + ADD_PORT); - return inw(dev->base_addr + DATA_PORT); + return inw(base_addr + portno); } static void -writereg(struct net_device *dev, int portno, int value) +writeword(unsigned long base_addr, int portno, int value) { - outw(portno, dev->base_addr + ADD_PORT); - outw(value, dev->base_addr + DATA_PORT); + outw(value, base_addr + portno); } static int -readword(unsigned long base_addr, int portno) +readreg(struct net_device *dev, int portno) { - return inw(base_addr + portno); + outw(portno, dev->base_addr + ADD_PORT); + return inw(dev->base_addr + DATA_PORT); } static void -writeword(unsigned long base_addr, int portno, int value) +writereg(struct net_device *dev, int portno, int value) { - outw(value, base_addr + portno); + outw(portno, dev->base_addr + ADD_PORT); + outw(value, dev->base_addr + DATA_PORT); } static int __init -- cgit v1.2.2 From 3eaa5e7dcce0653d2bfd2ab85a623687da49f8d5 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Sun, 8 Jan 2006 01:01:10 -0800 Subject: [PATCH] cs89x0: make {read,write}reg use {read,write}word Make readreg/writereg use readword/writeword. Signed-off-by: Lennert Buytenhek Cc: dmitry pervushin Cc: Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/cs89x0.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/cs89x0.c b/drivers/net/cs89x0.c index 221f92e45e..756d80adc4 100644 --- a/drivers/net/cs89x0.c +++ b/drivers/net/cs89x0.c @@ -351,17 +351,17 @@ writeword(unsigned long base_addr, int portno, int value) } static int -readreg(struct net_device *dev, int portno) +readreg(struct net_device *dev, int regno) { - outw(portno, dev->base_addr + ADD_PORT); - return inw(dev->base_addr + DATA_PORT); + writeword(dev->base_addr, ADD_PORT, regno); + return readword(dev->base_addr, DATA_PORT); } static void -writereg(struct net_device *dev, int portno, int value) +writereg(struct net_device *dev, int regno, int value) { - outw(portno, dev->base_addr + ADD_PORT); - outw(value, dev->base_addr + DATA_PORT); + writeword(dev->base_addr, ADD_PORT, regno); + writeword(dev->base_addr, DATA_PORT, value); } static int __init -- cgit v1.2.2 From 3b68d70dffe255e7681d5725d96bc2b92a24bb9d Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Sun, 8 Jan 2006 01:01:11 -0800 Subject: [PATCH] cs89x0: cleanly implement ixdp2x01 and pnx0501 support Implement suitable versions of the readword/writeword macros for ixdp2x01 and pnx0501. 
Handle the 32-bit spacing of the registers in these functions instead of in the header file. Signed-off-by: Lennert Buytenhek Cc: dmitry pervushin Cc: Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/cs89x0.c | 28 ++++++++++++++++++++++++++++ drivers/net/cs89x0.h | 19 ++++++------------- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/drivers/net/cs89x0.c b/drivers/net/cs89x0.c index 756d80adc4..7abc9f858f 100644 --- a/drivers/net/cs89x0.c +++ b/drivers/net/cs89x0.c @@ -338,6 +338,32 @@ out: } #endif +#if defined(CONFIG_ARCH_IXDP2X01) +static int +readword(unsigned long base_addr, int portno) +{ + return (u16)__raw_readl(base_addr + (portno << 1)); +} + +static void +writeword(unsigned long base_addr, int portno, int value) +{ + __raw_writel((u16)value, base_addr + (portno << 1)); +} +#else +#if defined(CONFIG_ARCH_PNX0501) +static int +readword(unsigned long base_addr, int portno) +{ + return inw(base_addr + (portno << 1)); +} + +static void +writeword(unsigned long base_addr, int portno, int value) +{ + outw(value, base_addr + (portno << 1)); +} +#else static int readword(unsigned long base_addr, int portno) { @@ -349,6 +375,8 @@ writeword(unsigned long base_addr, int portno, int value) { outw(value, base_addr + portno); } +#endif +#endif static int readreg(struct net_device *dev, int regno) diff --git a/drivers/net/cs89x0.h b/drivers/net/cs89x0.h index decea264f1..bd954aaa63 100644 --- a/drivers/net/cs89x0.h +++ b/drivers/net/cs89x0.h @@ -16,13 +16,6 @@ #include -#if defined(CONFIG_ARCH_IXDP2X01) || defined(CONFIG_ARCH_PNX0105) -/* IXDP2401/IXDP2801 uses dword-aligned register addressing */ -#define CS89x0_PORT(reg) ((reg) * 2) -#else -#define CS89x0_PORT(reg) (reg) -#endif - #define PP_ChipID 0x0000 /* offset 0h -> Corp -ID */ /* offset 2h -> Model/Product Number */ /* offset 3h -> Chip Revision Number */ @@ -332,16 +325,16 @@ #define RAM_SIZE 0x1000 /* The card has 4k bytes or RAM */ #define PKT_START PP_TxFrame /* Start of packet RAM */ -#define RX_FRAME_PORT CS89x0_PORT(0x0000) +#define RX_FRAME_PORT 0x0000 #define TX_FRAME_PORT RX_FRAME_PORT -#define TX_CMD_PORT CS89x0_PORT(0x0004) +#define TX_CMD_PORT 0x0004 #define TX_NOW 0x0000 /* Tx packet after 5 bytes copied */ #define TX_AFTER_381 0x0040 /* Tx packet after 381 bytes copied */ #define TX_AFTER_ALL 0x00c0 /* Tx packet after all bytes copied */ -#define TX_LEN_PORT CS89x0_PORT(0x0006) -#define ISQ_PORT CS89x0_PORT(0x0008) -#define ADD_PORT CS89x0_PORT(0x000A) -#define DATA_PORT CS89x0_PORT(0x000C) +#define TX_LEN_PORT 0x0006 +#define ISQ_PORT 0x0008 +#define ADD_PORT 0x000A +#define DATA_PORT 0x000C #define EEPROM_WRITE_EN 0x00F0 #define EEPROM_WRITE_DIS 0x0000 -- cgit v1.2.2 From 084f746a01ca026920e388e76e913cc7a26d5a3f Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Sun, 8 Jan 2006 01:01:12 -0800 Subject: [PATCH] cs89x0: switch {in,out}sw to {read,write}words Implement readwords/writewords that use readword/writeword, and switch the rest of the driver over to use these. 
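One detail worth noting before the hunk below: the bulk helpers take a count of 16-bit words, not bytes, so the receive path copies length >> 1 words and then fetches any odd trailing byte with a single extra readword(), as excerpted from this patch:

	readwords(ioaddr, RX_FRAME_PORT, skb_put(skb, length), length >> 1);
	if (length & 1)
		skb->data[length-1] = readword(ioaddr, RX_FRAME_PORT);

The transmit path rounds up instead, writing (skb->len + 1) >> 1 words, presumably because the chip only sends the byte count written to TX_LEN_PORT.
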
Signed-off-by: Lennert Buytenhek Cc: dmitry pervushin Cc: Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/cs89x0.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/drivers/net/cs89x0.c b/drivers/net/cs89x0.c index 7abc9f858f..f7ec590e80 100644 --- a/drivers/net/cs89x0.c +++ b/drivers/net/cs89x0.c @@ -378,6 +378,34 @@ writeword(unsigned long base_addr, int portno, int value) #endif #endif +static void +readwords(unsigned long base_addr, int portno, void *buf, int length) +{ + u8 *buf8 = (u8 *)buf; + + do { + u32 tmp32; + + tmp32 = readword(base_addr, portno); + *buf8++ = (u8)tmp32; + *buf8++ = (u8)(tmp32 >> 8); + } while (--length); +} + +static void +writewords(unsigned long base_addr, int portno, void *buf, int length) +{ + u8 *buf8 = (u8 *)buf; + + do { + u32 tmp32; + + tmp32 = *buf8++; + tmp32 |= (*buf8++) << 8; + writeword(base_addr, portno, tmp32); + } while (--length); +} + static int readreg(struct net_device *dev, int regno) { @@ -1143,7 +1171,7 @@ send_test_pkt(struct net_device *dev) return 0; /* this shouldn't happen */ /* Write the contents of the packet */ - outsw(dev->base_addr + TX_FRAME_PORT,test_packet,(ETH_ZLEN+1) >>1); + writewords(dev->base_addr, TX_FRAME_PORT,test_packet,(ETH_ZLEN+1) >>1); if (net_debug > 1) printk("Sending test packet "); /* wait a couple of jiffies for packet to be received */ @@ -1500,7 +1528,7 @@ static int net_send_packet(struct sk_buff *skb, struct net_device *dev) return 1; } /* Write the contents of the packet */ - outsw(dev->base_addr + TX_FRAME_PORT,skb->data,(skb->len+1) >>1); + writewords(dev->base_addr, TX_FRAME_PORT,skb->data,(skb->len+1) >>1); spin_unlock_irq(&lp->lock); lp->stats.tx_bytes += skb->len; dev->trans_start = jiffies; @@ -1654,7 +1682,7 @@ net_rx(struct net_device *dev) skb_reserve(skb, 2); /* longword align L3 header */ skb->dev = dev; - insw(ioaddr + RX_FRAME_PORT, skb_put(skb, length), length >> 1); + readwords(ioaddr, RX_FRAME_PORT, skb_put(skb, length), length >> 1); if (length & 1) skb->data[length-1] = readword(ioaddr, RX_FRAME_PORT); -- cgit v1.2.2 From 37610ff3cb63f59b89370681ff780687d8a7bed3 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Sun, 8 Jan 2006 01:01:13 -0800 Subject: [PATCH] fix Kconfig depends for cs89x0 (PNX010X support) PNX010X support for CS89x0 should be conditional on NET_PCI, as it is an 'on board controller' and NET_PCI includes that category of NICs. Since ARCH_PNX0105 was recently changed to ARCH_PNX010X, incorporate that change as well while we're at it. Signed-off-by: Lennert Buytenhek Cc: dmitry pervushin Cc: Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index e2fa29b612..1960961bf2 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -1374,7 +1374,7 @@ config FORCEDETH config CS89x0 tristate "CS89x0 support" - depends on (NET_PCI && (ISA || ARCH_IXDP2X01)) || ARCH_PNX0105 + depends on NET_PCI && (ISA || ARCH_IXDP2X01 || ARCH_PNX010X) ---help--- Support for CS89x0 chipset based Ethernet cards. 
If you have a network (Ethernet) card of this type, say Y and read the -- cgit v1.2.2 From 277cb103e3d7b31b8f4941b6a495b1b80236b05c Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Sun, 8 Jan 2006 01:01:14 -0800 Subject: [PATCH] cs89x0: fix up after pnx0105 Kconfig symbol renaming The Kconfig symbol for pnx0105 was recently renamed to ARCH_PNX010X. Signed-off-by: Lennert Buytenhek Cc: dmitry pervushin Cc: Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/cs89x0.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/cs89x0.c b/drivers/net/cs89x0.c index f7ec590e80..907c010097 100644 --- a/drivers/net/cs89x0.c +++ b/drivers/net/cs89x0.c @@ -175,7 +175,7 @@ static unsigned int cs8900_irq_map[] = {1,0,0,0}; #include static unsigned int netcard_portlist[] __initdata = {IXDP2X01_CS8900_VIRT_BASE, 0}; static unsigned int cs8900_irq_map[] = {IRQ_IXDP2X01_CS8900, 0, 0, 0}; -#elif defined(CONFIG_ARCH_PNX0105) +#elif defined(CONFIG_ARCH_PNX010X) #include #include #define CIRRUS_DEFAULT_BASE IO_ADDRESS(EXT_STATIC2_s0_BASE + 0x200000) /* = Physical address 0x48200000 */ @@ -351,7 +351,7 @@ writeword(unsigned long base_addr, int portno, int value) __raw_writel((u16)value, base_addr + (portno << 1)); } #else -#if defined(CONFIG_ARCH_PNX0501) +#if defined(CONFIG_ARCH_PNX010X) static int readword(unsigned long base_addr, int portno) { @@ -512,7 +512,7 @@ cs89x0_probe1(struct net_device *dev, int ioaddr, int modular) #endif } -#ifdef CONFIG_ARCH_PNX0105 +#ifdef CONFIG_ARCH_PNX010X initialize_ebi(); /* Map GPIO registers for the pins connected to the CS8900a. */ @@ -780,7 +780,7 @@ cs89x0_probe1(struct net_device *dev, int ioaddr, int modular) } else { i = lp->isa_config & INT_NO_MASK; if (lp->chip_type == CS8900) { -#if defined(CONFIG_ARCH_IXDP2X01) || defined(CONFIG_ARCH_PNX0105) +#if defined(CONFIG_ARCH_IXDP2X01) || defined(CONFIG_ARCH_PNX010X) i = cs8900_irq_map[0]; #else /* Translate the IRQ using the IRQ mapping table. 
*/ @@ -1256,7 +1256,7 @@ net_open(struct net_device *dev) int i; int ret; -#if !defined(CONFIG_SH_HICOSH4) && !defined(CONFIG_ARCH_PNX0105) /* uses irq#1, so this won't work */ +#if !defined(CONFIG_SH_HICOSH4) && !defined(CONFIG_ARCH_PNX010X) /* uses irq#1, so this won't work */ if (dev->irq < 2) { /* Allow interrupts to be generated by the chip */ /* Cirrus' release had this: */ @@ -1287,7 +1287,7 @@ net_open(struct net_device *dev) else #endif { -#if !defined(CONFIG_ARCH_IXDP2X01) && !defined(CONFIG_ARCH_PNX0105) +#if !defined(CONFIG_ARCH_IXDP2X01) && !defined(CONFIG_ARCH_PNX010X) if (((1 << dev->irq) & lp->irq_map) == 0) { printk(KERN_ERR "%s: IRQ %d is not in our map of allowable IRQs, which is %x\n", dev->name, dev->irq, lp->irq_map); @@ -1372,7 +1372,7 @@ net_open(struct net_device *dev) case A_CNF_MEDIA_10B_2: result = lp->adapter_cnf & A_CNF_10B_2; break; default: result = lp->adapter_cnf & (A_CNF_10B_T | A_CNF_AUI | A_CNF_10B_2); } -#ifdef CONFIG_ARCH_PNX0105 +#ifdef CONFIG_ARCH_PNX010X result = A_CNF_10B_T; #endif if (!result) { -- cgit v1.2.2 From 2919b51075b3906c2f476e5a932244af1947bf80 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:16 -0800 Subject: [PATCH] frv: suppress configuration of certain features for FRV Suppress configuration of certain features for the FRV arch as they can't be built for FRV at the moment: (*) RTC (*) HISAX_* (*) PARPORT_PC (*) VGA_CONSOLE (*) BINFMT_ELF Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/Kconfig | 4 ++-- drivers/isdn/hisax/Kconfig | 10 +++++----- drivers/parport/Kconfig | 2 +- drivers/video/console/Kconfig | 2 +- fs/Kconfig.binfmt | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 5ebd06b1b4..b1b34edcd7 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -687,7 +687,7 @@ config NVRAM config RTC tristate "Enhanced Real Time Clock Support" - depends on !PPC32 && !PARISC && !IA64 && !M68K && (!SPARC || PCI) + depends on !PPC32 && !PARISC && !IA64 && !M68K && (!SPARC || PCI) && !FRV ---help--- If you say Y here and create a character special file /dev/rtc with major number 10 and minor number 135 using mknod ("man mknod"), you @@ -735,7 +735,7 @@ config SGI_IP27_RTC config GEN_RTC tristate "Generic /dev/rtc emulation" - depends on RTC!=y && !IA64 && !ARM && !M32R && !SPARC + depends on RTC!=y && !IA64 && !ARM && !M32R && !SPARC && !FRV ---help--- If you say Y here and create a character special file /dev/rtc with major number 10 and minor number 135 using mknod ("man mknod"), you diff --git a/drivers/isdn/hisax/Kconfig b/drivers/isdn/hisax/Kconfig index c82105920d..0ef560144b 100644 --- a/drivers/isdn/hisax/Kconfig +++ b/drivers/isdn/hisax/Kconfig @@ -110,7 +110,7 @@ config HISAX_16_3 config HISAX_TELESPCI bool "Teles PCI" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K)) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) help This enables HiSax support for the Teles PCI. See on how to configure it. @@ -238,7 +238,7 @@ config HISAX_MIC config HISAX_NETJET bool "NETjet card" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K)) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) help This enables HiSax support for the NetJet from Traverse Technologies. 
@@ -249,7 +249,7 @@ config HISAX_NETJET config HISAX_NETJET_U bool "NETspider U card" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K)) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) help This enables HiSax support for the Netspider U interface ISDN card from Traverse Technologies. @@ -317,7 +317,7 @@ config HISAX_GAZEL config HISAX_HFC_PCI bool "HFC PCI-Bus cards" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K)) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) help This enables HiSax support for the HFC-S PCI 2BDS0 based cards. @@ -344,7 +344,7 @@ config HISAX_HFC_SX config HISAX_ENTERNOW_PCI bool "Formula-n enter:now PCI card" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K)) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) help This enables HiSax support for the Formula-n enter:now PCI ISDN card. diff --git a/drivers/parport/Kconfig b/drivers/parport/Kconfig index b8241561da..a665951b15 100644 --- a/drivers/parport/Kconfig +++ b/drivers/parport/Kconfig @@ -34,7 +34,7 @@ config PARPORT config PARPORT_PC tristate "PC-style hardware" - depends on PARPORT && (!SPARC64 || PCI) && !SPARC32 && !M32R + depends on PARPORT && (!SPARC64 || PCI) && !SPARC32 && !M32R && !FRV ---help--- You should say Y here if you have a PC-style parallel port. All IBM PC compatible computers and some Alphas have PC-style diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig index a5d09e159c..6ee449858a 100644 --- a/drivers/video/console/Kconfig +++ b/drivers/video/console/Kconfig @@ -6,7 +6,7 @@ menu "Console display driver support" config VGA_CONSOLE bool "VGA text console" if EMBEDDED || !X86 - depends on !ARCH_ACORN && !ARCH_EBSA110 && !4xx && !8xx && !SPARC && !M68K && !PARISC && !ARCH_VERSATILE + depends on !ARCH_ACORN && !ARCH_EBSA110 && !4xx && !8xx && !SPARC && !M68K && !PARISC && !FRV && !ARCH_VERSATILE default y help Saying Y here will allow you to use Linux in text mode through a diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 175b2e8177..f3d3d81eb7 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -1,6 +1,6 @@ config BINFMT_ELF bool "Kernel support for ELF binaries" - depends on MMU + depends on MMU && (BROKEN || !FRV) default y ---help--- ELF (Executable and Linkable Format) is a format for libraries and -- cgit v1.2.2 From 2fa9e7e2dce6aafecc1890461bdc00d78897be8b Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:17 -0800 Subject: [PATCH] frv: drop 8/16-bit xchg and cmpxchg Drop support for 8-bit and 16-bit xchg and cmpxchg emulation and implements 32-bit xchg with the SWAP/SWAPI instruction. 
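For reference, a minimal sketch (illustrative only, not part of the patch) of what this restriction means for FRV callers: only 32-bit objects can be passed to xchg() and cmpxchg() from here on, while 8-bit and 16-bit uses land in the unsupported default case.

/* Illustrative only; assumes the asm-frv/atomic.h definitions changed below. */
#include <asm/atomic.h>

static atomic_t example_count = ATOMIC_INIT(0);
static unsigned long example_flag;

static unsigned long example(void)
{
	atomic_cmpxchg(&example_count, 0, 1);	/* OK: atomic_t counter is 32-bit */
	return xchg(&example_flag, 1UL);	/* OK: long is 32-bit on FRV */
	/* u16 v; xchg(&v, 2); would now hit the unsupported "default" case */
}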
Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/lib/atomic-ops.S | 92 --------------------------------------------- include/asm-frv/atomic.h | 96 +++-------------------------------------------- 2 files changed, 5 insertions(+), 183 deletions(-) diff --git a/arch/frv/lib/atomic-ops.S b/arch/frv/lib/atomic-ops.S index b03d510a89..545cd325ac 100644 --- a/arch/frv/lib/atomic-ops.S +++ b/arch/frv/lib/atomic-ops.S @@ -127,48 +127,6 @@ atomic_sub_return: .size atomic_sub_return, .-atomic_sub_return -############################################################################### -# -# uint8_t __xchg_8(uint8_t i, uint8_t *v) -# -############################################################################### - .globl __xchg_8 - .type __xchg_8,@function -__xchg_8: - or.p gr8,gr8,gr10 -0: - orcc gr0,gr0,gr0,icc3 /* set ICC3.Z */ - ckeq icc3,cc7 - ldub.p @(gr9,gr0),gr8 /* LD.P/ORCR must be atomic */ - orcr cc7,cc7,cc3 /* set CC3 to true */ - cstb.p gr10,@(gr9,gr0) ,cc3,#1 - corcc gr29,gr29,gr0 ,cc3,#1 /* clear ICC3.Z if store happens */ - beq icc3,#0,0b - bralr - - .size __xchg_8, .-__xchg_8 - -############################################################################### -# -# uint16_t __xchg_16(uint16_t i, uint16_t *v) -# -############################################################################### - .globl __xchg_16 - .type __xchg_16,@function -__xchg_16: - or.p gr8,gr8,gr10 -0: - orcc gr0,gr0,gr0,icc3 /* set ICC3.Z */ - ckeq icc3,cc7 - lduh.p @(gr9,gr0),gr8 /* LD.P/ORCR must be atomic */ - orcr cc7,cc7,cc3 /* set CC3 to true */ - csth.p gr10,@(gr9,gr0) ,cc3,#1 - corcc gr29,gr29,gr0 ,cc3,#1 /* clear ICC3.Z if store happens */ - beq icc3,#0,0b - bralr - - .size __xchg_16, .-__xchg_16 - ############################################################################### # # uint32_t __xchg_32(uint32_t i, uint32_t *v) @@ -190,56 +148,6 @@ __xchg_32: .size __xchg_32, .-__xchg_32 -############################################################################### -# -# uint8_t __cmpxchg_8(uint8_t *v, uint8_t test, uint8_t new) -# -############################################################################### - .globl __cmpxchg_8 - .type __cmpxchg_8,@function -__cmpxchg_8: - or.p gr8,gr8,gr11 -0: - orcc gr0,gr0,gr0,icc3 - ckeq icc3,cc7 - ldub.p @(gr11,gr0),gr8 - orcr cc7,cc7,cc3 - sub gr8,gr9,gr7 - sllicc gr7,#24,gr0,icc0 - bne icc0,#0,1f - cstb.p gr10,@(gr11,gr0) ,cc3,#1 - corcc gr29,gr29,gr0 ,cc3,#1 - beq icc3,#0,0b -1: - bralr - - .size __cmpxchg_8, .-__cmpxchg_8 - -############################################################################### -# -# uint16_t __cmpxchg_16(uint16_t *v, uint16_t test, uint16_t new) -# -############################################################################### - .globl __cmpxchg_16 - .type __cmpxchg_16,@function -__cmpxchg_16: - or.p gr8,gr8,gr11 -0: - orcc gr0,gr0,gr0,icc3 - ckeq icc3,cc7 - lduh.p @(gr11,gr0),gr8 - orcr cc7,cc7,cc3 - sub gr8,gr9,gr7 - sllicc gr7,#16,gr0,icc0 - bne icc0,#0,1f - csth.p gr10,@(gr11,gr0) ,cc3,#1 - corcc gr29,gr29,gr0 ,cc3,#1 - beq icc3,#0,0b -1: - bralr - - .size __cmpxchg_16, .-__cmpxchg_16 - ############################################################################### # # uint32_t __cmpxchg_32(uint32_t *v, uint32_t test, uint32_t new) diff --git a/include/asm-frv/atomic.h b/include/asm-frv/atomic.h index 3f54fea2b0..9c9e9499cf 100644 --- a/include/asm-frv/atomic.h +++ b/include/asm-frv/atomic.h @@ -218,51 +218,12 @@ extern unsigned long atomic_test_and_XOR_mask(unsigned long mask, 
volatile unsig __typeof__(*(ptr)) __xg_orig; \ \ switch (sizeof(__xg_orig)) { \ - case 1: \ - asm volatile( \ - "0: \n" \ - " orcc gr0,gr0,gr0,icc3 \n" \ - " ckeq icc3,cc7 \n" \ - " ldub.p %M0,%1 \n" \ - " orcr cc7,cc7,cc3 \n" \ - " cstb.p %2,%M0 ,cc3,#1 \n" \ - " corcc gr29,gr29,gr0 ,cc3,#1 \n" \ - " beq icc3,#0,0b \n" \ - : "+U"(*__xg_ptr), "=&r"(__xg_orig) \ - : "r"(x) \ - : "memory", "cc7", "cc3", "icc3" \ - ); \ - break; \ - \ - case 2: \ - asm volatile( \ - "0: \n" \ - " orcc gr0,gr0,gr0,icc3 \n" \ - " ckeq icc3,cc7 \n" \ - " lduh.p %M0,%1 \n" \ - " orcr cc7,cc7,cc3 \n" \ - " csth.p %2,%M0 ,cc3,#1 \n" \ - " corcc gr29,gr29,gr0 ,cc3,#1 \n" \ - " beq icc3,#0,0b \n" \ - : "+U"(*__xg_ptr), "=&r"(__xg_orig) \ - : "r"(x) \ - : "memory", "cc7", "cc3", "icc3" \ - ); \ - break; \ - \ case 4: \ asm volatile( \ - "0: \n" \ - " orcc gr0,gr0,gr0,icc3 \n" \ - " ckeq icc3,cc7 \n" \ - " ld.p %M0,%1 \n" \ - " orcr cc7,cc7,cc3 \n" \ - " cst.p %2,%M0 ,cc3,#1 \n" \ - " corcc gr29,gr29,gr0 ,cc3,#1 \n" \ - " beq icc3,#0,0b \n" \ - : "+U"(*__xg_ptr), "=&r"(__xg_orig) \ + "swap%I0 %2,%M0" \ + : "+m"(*__xg_ptr), "=&r"(__xg_orig) \ : "r"(x) \ - : "memory", "cc7", "cc3", "icc3" \ + : "memory" \ ); \ break; \ \ @@ -277,8 +238,6 @@ extern unsigned long atomic_test_and_XOR_mask(unsigned long mask, volatile unsig #else -extern uint8_t __xchg_8 (uint8_t i, volatile void *v); -extern uint16_t __xchg_16(uint16_t i, volatile void *v); extern uint32_t __xchg_32(uint32_t i, volatile void *v); #define xchg(ptr, x) \ @@ -287,8 +246,6 @@ extern uint32_t __xchg_32(uint32_t i, volatile void *v); __typeof__(*(ptr)) __xg_orig; \ \ switch (sizeof(__xg_orig)) { \ - case 1: __xg_orig = (__typeof__(*(ptr))) __xchg_8 ((uint8_t) x, __xg_ptr); break; \ - case 2: __xg_orig = (__typeof__(*(ptr))) __xchg_16((uint16_t) x, __xg_ptr); break; \ case 4: __xg_orig = (__typeof__(*(ptr))) __xchg_32((uint32_t) x, __xg_ptr); break; \ default: \ __xg_orig = 0; \ @@ -318,46 +275,6 @@ extern uint32_t __xchg_32(uint32_t i, volatile void *v); __typeof__(*(ptr)) __xg_new = (new); \ \ switch (sizeof(__xg_orig)) { \ - case 1: \ - asm volatile( \ - "0: \n" \ - " orcc gr0,gr0,gr0,icc3 \n" \ - " ckeq icc3,cc7 \n" \ - " ldub.p %M0,%1 \n" \ - " orcr cc7,cc7,cc3 \n" \ - " sub%I4 %1,%4,%2 \n" \ - " sllcc %2,#24,gr0,icc0 \n" \ - " bne icc0,#0,1f \n" \ - " cstb.p %3,%M0 ,cc3,#1 \n" \ - " corcc gr29,gr29,gr0 ,cc3,#1 \n" \ - " beq icc3,#0,0b \n" \ - "1: \n" \ - : "+U"(*__xg_ptr), "=&r"(__xg_orig), "=&r"(__xg_tmp) \ - : "r"(__xg_new), "NPr"(__xg_test) \ - : "memory", "cc7", "cc3", "icc3", "icc0" \ - ); \ - break; \ - \ - case 2: \ - asm volatile( \ - "0: \n" \ - " orcc gr0,gr0,gr0,icc3 \n" \ - " ckeq icc3,cc7 \n" \ - " lduh.p %M0,%1 \n" \ - " orcr cc7,cc7,cc3 \n" \ - " sub%I4 %1,%4,%2 \n" \ - " sllcc %2,#16,gr0,icc0 \n" \ - " bne icc0,#0,1f \n" \ - " csth.p %3,%M0 ,cc3,#1 \n" \ - " corcc gr29,gr29,gr0 ,cc3,#1 \n" \ - " beq icc3,#0,0b \n" \ - "1: \n" \ - : "+U"(*__xg_ptr), "=&r"(__xg_orig), "=&r"(__xg_tmp) \ - : "r"(__xg_new), "NPr"(__xg_test) \ - : "memory", "cc7", "cc3", "icc3", "icc0" \ - ); \ - break; \ - \ case 4: \ asm volatile( \ "0: \n" \ @@ -388,8 +305,6 @@ extern uint32_t __xchg_32(uint32_t i, volatile void *v); #else -extern uint8_t __cmpxchg_8 (uint8_t *v, uint8_t test, uint8_t new); -extern uint16_t __cmpxchg_16(uint16_t *v, uint16_t test, uint16_t new); extern uint32_t __cmpxchg_32(uint32_t *v, uint32_t test, uint32_t new); #define cmpxchg(ptr, test, new) \ @@ -400,8 +315,6 @@ extern uint32_t __cmpxchg_32(uint32_t *v, uint32_t test, uint32_t new); 
__typeof__(*(ptr)) __xg_new = (new); \ \ switch (sizeof(__xg_orig)) { \ - case 1: __xg_orig = __cmpxchg_8 (__xg_ptr, __xg_test, __xg_new); break; \ - case 2: __xg_orig = __cmpxchg_16(__xg_ptr, __xg_test, __xg_new); break; \ case 4: __xg_orig = __cmpxchg_32(__xg_ptr, __xg_test, __xg_new); break; \ default: \ __xg_orig = 0; \ @@ -414,7 +327,7 @@ extern uint32_t __cmpxchg_32(uint32_t *v, uint32_t test, uint32_t new); #endif -#define atomic_cmpxchg(v, old, new) ((int)cmpxchg(&((v)->counter), old, new)) +#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), old, new)) #define atomic_add_unless(v, a, u) \ ({ \ @@ -424,6 +337,7 @@ extern uint32_t __cmpxchg_32(uint32_t *v, uint32_t test, uint32_t new); c = old; \ c != (u); \ }) + #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #include -- cgit v1.2.2 From 8369ce4cfe18decc3ea0afcf91e67c665479c78e Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:18 -0800 Subject: [PATCH] frv: drop unsupported debugging features Drop support for debugging features that aren't supported on FRV: (*) EARLY_PRINTK The on-chip UARTs are set up early enough that this isn't required, and VGA support isn't available. There's also a gdbstub available. (*) DEBUG_PAGEALLOC This can't be easily be done since we use huge static mappings to cover the kernel, not pages. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/Kconfig.debug | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/arch/frv/Kconfig.debug b/arch/frv/Kconfig.debug index 0034b65499..211f01bc4c 100644 --- a/arch/frv/Kconfig.debug +++ b/arch/frv/Kconfig.debug @@ -2,32 +2,10 @@ menu "Kernel hacking" source "lib/Kconfig.debug" -config EARLY_PRINTK - bool "Early printk" - depends on EMBEDDED && DEBUG_KERNEL - default n - help - Write kernel log output directly into the VGA buffer or to a serial - port. - - This is useful for kernel debugging when your machine crashes very - early before the console code is initialized. For normal operation - it is not recommended because it looks ugly and doesn't cooperate - with klogd/syslogd or the X server. You should normally N here, - unless you want to debug such a crash. - config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL -config DEBUG_PAGEALLOC - bool "Page alloc debugging" - depends on DEBUG_KERNEL - help - Unmap pages from the kernel linear mapping after free_pages(). - This results in a large slowdown, but helps to find certain types - of memory corruptions. - config GDBSTUB bool "Remote GDB kernel debugging" depends on DEBUG_KERNEL -- cgit v1.2.2 From 402344012ebe696d9353bbf056889ddaaec83079 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:19 -0800 Subject: [PATCH] frv: implement and export various things required by modules Export a number of features required to build all the modules. It also implements the following simple features: (*) csum_partial_copy_from_user() for MMU as well as no-MMU. (*) __ucmpdi2(). so that they can be exported too. 
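As a reference for the __ucmpdi2() item above, a hedged C model (not part of the patch) of the semantics the new arch/frv/lib/__ucmpdi2.S implements; libgcc expects 0, 1 or 2 according to whether a < b, a == b or a > b.

/* Reference-only C equivalent of the hand-written FRV assembly added below. */
int __ucmpdi2_model(unsigned long long a, unsigned long long b)
{
	if (a < b)
		return 0;
	if (a > b)
		return 2;
	return 1;
}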
Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/kernel/frv_ksyms.c | 25 +++++++++++++------ arch/frv/kernel/irq.c | 17 +++++++++++++ arch/frv/kernel/pm.c | 2 ++ arch/frv/kernel/time.c | 3 +++ arch/frv/kernel/traps.c | 3 +++ arch/frv/kernel/uaccess.c | 7 ++++++ arch/frv/lib/Makefile | 2 +- arch/frv/lib/__ucmpdi2.S | 45 +++++++++++++++++++++++++++++++++++ arch/frv/lib/checksum.c | 31 +++++++++++++++++++----- arch/frv/mb93090-mb00/pci-dma-nommu.c | 8 +++++++ arch/frv/mb93090-mb00/pci-dma.c | 10 ++++++++ arch/frv/mm/cache-page.c | 5 ++++ arch/frv/mm/highmem.c | 8 +++++++ lib/find_next_bit.c | 3 +++ 14 files changed, 155 insertions(+), 14 deletions(-) create mode 100644 arch/frv/lib/__ucmpdi2.S diff --git a/arch/frv/kernel/frv_ksyms.c b/arch/frv/kernel/frv_ksyms.c index 1a76d52471..5f118c89d0 100644 --- a/arch/frv/kernel/frv_ksyms.c +++ b/arch/frv/kernel/frv_ksyms.c @@ -16,10 +16,11 @@ #include #include #include -#include +#include extern void dump_thread(struct pt_regs *, struct user *); extern long __memcpy_user(void *dst, const void *src, size_t count); +extern long __memset_user(void *dst, const void *src, size_t count); /* platform dependent support */ @@ -50,7 +51,11 @@ EXPORT_SYMBOL(disable_irq); EXPORT_SYMBOL(__res_bus_clock_speed_HZ); EXPORT_SYMBOL(__page_offset); EXPORT_SYMBOL(__memcpy_user); -EXPORT_SYMBOL(flush_dcache_page); +EXPORT_SYMBOL(__memset_user); +EXPORT_SYMBOL(frv_dcache_writeback); +EXPORT_SYMBOL(frv_cache_invalidate); +EXPORT_SYMBOL(frv_icache_invalidate); +EXPORT_SYMBOL(frv_cache_wback_inv); #ifndef CONFIG_MMU EXPORT_SYMBOL(memory_start); @@ -72,6 +77,9 @@ EXPORT_SYMBOL(memcmp); EXPORT_SYMBOL(memscan); EXPORT_SYMBOL(memmove); +EXPORT_SYMBOL(__outsl_ns); +EXPORT_SYMBOL(__insl_ns); + EXPORT_SYMBOL(get_wchan); #ifdef CONFIG_FRV_OUTOFLINE_ATOMIC_OPS @@ -80,14 +88,13 @@ EXPORT_SYMBOL(atomic_test_and_OR_mask); EXPORT_SYMBOL(atomic_test_and_XOR_mask); EXPORT_SYMBOL(atomic_add_return); EXPORT_SYMBOL(atomic_sub_return); -EXPORT_SYMBOL(__xchg_8); -EXPORT_SYMBOL(__xchg_16); EXPORT_SYMBOL(__xchg_32); -EXPORT_SYMBOL(__cmpxchg_8); -EXPORT_SYMBOL(__cmpxchg_16); EXPORT_SYMBOL(__cmpxchg_32); #endif +EXPORT_SYMBOL(__debug_bug_printk); +EXPORT_SYMBOL(__delay_loops_MHz); + /* * libgcc functions - functions that are used internally by the * compiler... 
(prototypes are not correct though, but that @@ -101,6 +108,8 @@ extern void __divdi3(void); extern void __lshrdi3(void); extern void __moddi3(void); extern void __muldi3(void); +extern void __mulll(void); +extern void __umulll(void); extern void __negdi2(void); extern void __ucmpdi2(void); extern void __udivdi3(void); @@ -116,8 +125,10 @@ EXPORT_SYMBOL(__ashrdi3); EXPORT_SYMBOL(__lshrdi3); //EXPORT_SYMBOL(__moddi3); EXPORT_SYMBOL(__muldi3); +EXPORT_SYMBOL(__mulll); +EXPORT_SYMBOL(__umulll); EXPORT_SYMBOL(__negdi2); -//EXPORT_SYMBOL(__ucmpdi2); +EXPORT_SYMBOL(__ucmpdi2); //EXPORT_SYMBOL(__udivdi3); //EXPORT_SYMBOL(__udivmoddi4); //EXPORT_SYMBOL(__umoddi3); diff --git a/arch/frv/kernel/irq.c b/arch/frv/kernel/irq.c index 8c524cdd27..59580c59c6 100644 --- a/arch/frv/kernel/irq.c +++ b/arch/frv/kernel/irq.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -178,6 +179,8 @@ void disable_irq_nosync(unsigned int irq) spin_unlock_irqrestore(&level->lock, flags); } +EXPORT_SYMBOL(disable_irq_nosync); + /** * disable_irq - disable an irq and wait for completion * @irq: Interrupt to disable @@ -204,6 +207,8 @@ void disable_irq(unsigned int irq) #endif } +EXPORT_SYMBOL(disable_irq); + /** * enable_irq - enable handling of an irq * @irq: Interrupt to enable @@ -268,6 +273,8 @@ void enable_irq(unsigned int irq) spin_unlock_irqrestore(&level->lock, flags); } +EXPORT_SYMBOL(enable_irq); + /*****************************************************************************/ /* * handles all normal device IRQ's @@ -425,6 +432,8 @@ int request_irq(unsigned int irq, return retval; } +EXPORT_SYMBOL(request_irq); + /** * free_irq - free an interrupt * @irq: Interrupt line to free @@ -496,6 +505,8 @@ void free_irq(unsigned int irq, void *dev_id) } } +EXPORT_SYMBOL(free_irq); + /* * IRQ autodetection code.. * @@ -519,6 +530,8 @@ unsigned long probe_irq_on(void) return 0; } +EXPORT_SYMBOL(probe_irq_on); + /* * Return a mask of triggered interrupts (this * can handle only legacy ISA interrupts). @@ -542,6 +555,8 @@ unsigned int probe_irq_mask(unsigned long xmask) return 0; } +EXPORT_SYMBOL(probe_irq_mask); + /* * Return the one interrupt that triggered (this can * handle any interrupt source). 
@@ -571,6 +586,8 @@ int probe_irq_off(unsigned long xmask) return -1; } +EXPORT_SYMBOL(probe_irq_off); + /* this was setup_x86_irq but it seems pretty generic */ int setup_irq(unsigned int irq, struct irqaction *new) { diff --git a/arch/frv/kernel/pm.c b/arch/frv/kernel/pm.c index 712c3c24c9..f0b8fff3e7 100644 --- a/arch/frv/kernel/pm.c +++ b/arch/frv/kernel/pm.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include "local.h" void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); extern void frv_change_cmode(int); diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c index 2e9741227b..24cf85f89e 100644 --- a/arch/frv/kernel/time.c +++ b/arch/frv/kernel/time.c @@ -189,6 +189,8 @@ void do_gettimeofday(struct timeval *tv) tv->tv_usec = usec; } +EXPORT_SYMBOL(do_gettimeofday); + int do_settimeofday(struct timespec *tv) { time_t wtm_sec, sec = tv->tv_sec; @@ -218,6 +220,7 @@ int do_settimeofday(struct timespec *tv) clock_was_set(); return 0; } + EXPORT_SYMBOL(do_settimeofday); /* diff --git a/arch/frv/kernel/traps.c b/arch/frv/kernel/traps.c index 89073cae4b..9eb84b2e6a 100644 --- a/arch/frv/kernel/traps.c +++ b/arch/frv/kernel/traps.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -250,6 +251,8 @@ void dump_stack(void) show_stack(NULL, NULL); } +EXPORT_SYMBOL(dump_stack); + void show_stack(struct task_struct *task, unsigned long *sp) { } diff --git a/arch/frv/kernel/uaccess.c b/arch/frv/kernel/uaccess.c index f3fd58a5bc..9b751c0f0e 100644 --- a/arch/frv/kernel/uaccess.c +++ b/arch/frv/kernel/uaccess.c @@ -10,6 +10,7 @@ */ #include +#include #include /*****************************************************************************/ @@ -58,8 +59,11 @@ long strncpy_from_user(char *dst, const char *src, long count) memset(p, 0, count); /* clear remainder of buffer [security] */ return err; + } /* end strncpy_from_user() */ +EXPORT_SYMBOL(strncpy_from_user); + /*****************************************************************************/ /* * Return the size of a string (including the ending 0) @@ -92,4 +96,7 @@ long strnlen_user(const char *src, long count) } return p - src + 1; /* return length including NUL */ + } /* end strnlen_user() */ + +EXPORT_SYMBOL(strnlen_user); diff --git a/arch/frv/lib/Makefile b/arch/frv/lib/Makefile index 19be2626d5..08be305c9f 100644 --- a/arch/frv/lib/Makefile +++ b/arch/frv/lib/Makefile @@ -3,6 +3,6 @@ # lib-y := \ - __ashldi3.o __lshrdi3.o __muldi3.o __ashrdi3.o __negdi2.o \ + __ashldi3.o __lshrdi3.o __muldi3.o __ashrdi3.o __negdi2.o __ucmpdi2.o \ checksum.o memcpy.o memset.o atomic-ops.o \ outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o diff --git a/arch/frv/lib/__ucmpdi2.S b/arch/frv/lib/__ucmpdi2.S new file mode 100644 index 0000000000..d892f16ffa --- /dev/null +++ b/arch/frv/lib/__ucmpdi2.S @@ -0,0 +1,45 @@ +/* __ucmpdi2.S: 64-bit unsigned compare + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + + .text + .p2align 4 + +############################################################################### +# +# int __ucmpdi2(unsigned long long a [GR8:GR9], +# unsigned long long b [GR10:GR11]) +# +# - returns 0, 1, or 2 as a <, =, > b respectively. 
+# +############################################################################### + .globl __ucmpdi2 + .type __ucmpdi2,@function +__ucmpdi2: + or.p gr8,gr0,gr4 + subcc gr8,gr10,gr0,icc0 + setlos.p #0,gr8 + bclr icc0,#2 ; a.msw < b.msw + + setlos.p #2,gr8 + bhilr icc0,#0 ; a.msw > b.msw + + subcc.p gr9,gr11,gr0,icc1 + setlos #0,gr8 + setlos.p #2,gr9 + setlos #1,gr7 + cknc icc1,cc6 + cor.p gr9,gr0,gr8, cc6,#1 + cckls icc1,cc4, cc6,#1 + andcr cc6,cc4,cc4 + cor gr7,gr0,gr8, cc4,#1 + bralr + .size __ucmpdi2, .-__ucmpdi2 diff --git a/arch/frv/lib/checksum.c b/arch/frv/lib/checksum.c index 7bf5bd6cac..20e7dfc474 100644 --- a/arch/frv/lib/checksum.c +++ b/arch/frv/lib/checksum.c @@ -33,6 +33,7 @@ #include #include +#include static inline unsigned short from32to16(unsigned long x) { @@ -115,34 +116,52 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) return result; } +EXPORT_SYMBOL(csum_partial); + /* * this routine is used for miscellaneous IP-like checksums, mainly * in icmp.c */ unsigned short ip_compute_csum(const unsigned char * buff, int len) { - return ~do_csum(buff,len); + return ~do_csum(buff, len); } +EXPORT_SYMBOL(ip_compute_csum); + /* * copy from fs while checksumming, otherwise like csum_partial */ - unsigned int -csum_partial_copy_from_user(const char *src, char *dst, int len, int sum, int *csum_err) +csum_partial_copy_from_user(const char __user *src, char *dst, + int len, int sum, int *csum_err) { - if (csum_err) *csum_err = 0; - memcpy(dst, src, len); + int rem; + + if (csum_err) + *csum_err = 0; + + rem = copy_from_user(dst, src, len); + if (rem != 0) { + if (csum_err) + *csum_err = -EFAULT; + memset(dst + len - rem, 0, rem); + len = rem; + } + return csum_partial(dst, len, sum); } +EXPORT_SYMBOL(csum_partial_copy_from_user); + /* * copy from ds while checksumming, otherwise like csum_partial */ - unsigned int csum_partial_copy(const char *src, char *dst, int len, int sum) { memcpy(dst, src, len); return csum_partial(dst, len, sum); } + +EXPORT_SYMBOL(csum_partial_copy); diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c index 2082a9647f..4985466b1a 100644 --- a/arch/frv/mb93090-mb00/pci-dma-nommu.c +++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c @@ -83,6 +83,8 @@ void *dma_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_hand return NULL; } +EXPORT_SYMBOL(dma_alloc_coherent); + void dma_free_coherent(struct device *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle) { struct dma_alloc_record *rec; @@ -102,6 +104,8 @@ void dma_free_coherent(struct device *hwdev, size_t size, void *vaddr, dma_addr_ BUG(); } +EXPORT_SYMBOL(dma_free_coherent); + /* * Map a single buffer of the indicated size for DMA in streaming mode. * The 32-bit bus address to use is returned. @@ -120,6 +124,8 @@ dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, return virt_to_bus(ptr); } +EXPORT_SYMBOL(dma_map_single); + /* * Map a set of buffers described by scatterlist in streaming * mode for DMA. 
This is the scather-gather version of the @@ -150,3 +156,5 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, return nents; } + +EXPORT_SYMBOL(dma_map_sg); diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c index 86fbdadc51..671ce1e843 100644 --- a/arch/frv/mb93090-mb00/pci-dma.c +++ b/arch/frv/mb93090-mb00/pci-dma.c @@ -28,11 +28,15 @@ void *dma_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_hand return ret; } +EXPORT_SYMBOL(dma_alloc_coherent); + void dma_free_coherent(struct device *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle) { consistent_free(vaddr); } +EXPORT_SYMBOL(dma_free_coherent); + /* * Map a single buffer of the indicated size for DMA in streaming mode. * The 32-bit bus address to use is returned. @@ -51,6 +55,8 @@ dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, return virt_to_bus(ptr); } +EXPORT_SYMBOL(dma_map_single); + /* * Map a set of buffers described by scatterlist in streaming * mode for DMA. This is the scather-gather version of the @@ -96,6 +102,8 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, return nents; } +EXPORT_SYMBOL(dma_map_sg); + dma_addr_t dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction) { @@ -103,3 +111,5 @@ dma_addr_t dma_map_page(struct device *dev, struct page *page, unsigned long off flush_dcache_page(page); return (dma_addr_t) page_to_phys(page) + offset; } + +EXPORT_SYMBOL(dma_map_page); diff --git a/arch/frv/mm/cache-page.c b/arch/frv/mm/cache-page.c index 683b5e3443..0261cbe153 100644 --- a/arch/frv/mm/cache-page.c +++ b/arch/frv/mm/cache-page.c @@ -11,6 +11,7 @@ #include #include #include +#include #include /*****************************************************************************/ @@ -38,6 +39,8 @@ void flush_dcache_page(struct page *page) } /* end flush_dcache_page() */ +EXPORT_SYMBOL(flush_dcache_page); + /*****************************************************************************/ /* * ICI takes a virtual address and the page may not currently have one @@ -64,3 +67,5 @@ void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, } } /* end flush_icache_user_range() */ + +EXPORT_SYMBOL(flush_icache_user_range); diff --git a/arch/frv/mm/highmem.c b/arch/frv/mm/highmem.c index 7dc8fbf3af..7f77db7fab 100644 --- a/arch/frv/mm/highmem.c +++ b/arch/frv/mm/highmem.c @@ -9,6 +9,7 @@ * 2 of the License, or (at your option) any later version. 
*/ #include +#include void *kmap(struct page *page) { @@ -18,6 +19,8 @@ void *kmap(struct page *page) return kmap_high(page); } +EXPORT_SYMBOL(kmap); + void kunmap(struct page *page) { if (in_interrupt()) @@ -27,7 +30,12 @@ void kunmap(struct page *page) kunmap_high(page); } +EXPORT_SYMBOL(kunmap); + struct page *kmap_atomic_to_page(void *ptr) { return virt_to_page(ptr); } + + +EXPORT_SYMBOL(kmap_atomic_to_page); diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c index d08302d2a4..c05b4b19cf 100644 --- a/lib/find_next_bit.c +++ b/lib/find_next_bit.c @@ -10,6 +10,7 @@ */ #include +#include int find_next_bit(const unsigned long *addr, int size, int offset) { @@ -53,3 +54,5 @@ int find_next_bit(const unsigned long *addr, int size, int offset) return offset; } + +EXPORT_SYMBOL(find_next_bit); -- cgit v1.2.2 From 018b8d12bc85f8fb332239b11d919ea0724c49a4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:19 -0800 Subject: [PATCH] frv: support module exception tables Fix the exception table handling so that modules exceptions are dealt with. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/mm/extable.c | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/arch/frv/mm/extable.c b/arch/frv/mm/extable.c index 41be1128dc..caacf030ac 100644 --- a/arch/frv/mm/extable.c +++ b/arch/frv/mm/extable.c @@ -43,7 +43,7 @@ static inline unsigned long search_one_table(const struct exception_table_entry */ unsigned long search_exception_table(unsigned long pc) { - unsigned long ret = 0; + const struct exception_table_entry *extab; /* determine if the fault lay during a memcpy_user or a memset_user */ if (__frame->lr == (unsigned long) &__memset_user_error_lr && @@ -55,9 +55,10 @@ unsigned long search_exception_table(unsigned long pc) */ return (unsigned long) &__memset_user_error_handler; } - else if (__frame->lr == (unsigned long) &__memcpy_user_error_lr && - (unsigned long) &memcpy <= pc && pc < (unsigned long) &__memcpy_end - ) { + + if (__frame->lr == (unsigned long) &__memcpy_user_error_lr && + (unsigned long) &memcpy <= pc && pc < (unsigned long) &__memcpy_end + ) { /* the fault occurred in a protected memset * - we search for the return address (in LR) instead of the program counter * - it was probably during a copy_to/from_user() @@ -65,27 +66,10 @@ unsigned long search_exception_table(unsigned long pc) return (unsigned long) &__memcpy_user_error_handler; } -#ifndef CONFIG_MODULES - /* there is only the kernel to search. 
*/ - ret = search_one_table(__start___ex_table, __stop___ex_table - 1, pc); - return ret; - -#else - /* the kernel is the last "module" -- no need to treat it special */ - unsigned long flags; - struct module *mp; + extab = search_exception_tables(pc); + if (extab) + return extab->fixup; - spin_lock_irqsave(&modlist_lock, flags); - - for (mp = module_list; mp != NULL; mp = mp->next) { - if (mp->ex_table_start == NULL || !(mp->flags & (MOD_RUNNING | MOD_INITIALIZING))) - continue; - ret = search_one_table(mp->ex_table_start, mp->ex_table_end - 1, pc); - if (ret) - break; - } + return 0; - spin_unlock_irqrestore(&modlist_lock, flags); - return ret; -#endif } /* end search_exception_table() */ -- cgit v1.2.2 From a90a72c85fb202d0b172da27d8df2579b6591783 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:20 -0800 Subject: [PATCH] frv: supply various missing I/O access primitives Supply various I/O access primitives that are missing for the FRV arch: (*) mmiowb() (*) read*_relaxed() (*) ioport_*map() (*) ioread*(), iowrite*(), ioread*_rep() and iowrite*_rep() (*) pci_io*map() (*) check_signature() The patch also makes __is_PCI_addr() more efficient. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-frv/io.h | 123 ++++++++++++++++++++++++++++++++++++++++++++++ include/asm-frv/mb-regs.h | 4 ++ 2 files changed, 127 insertions(+) diff --git a/include/asm-frv/io.h b/include/asm-frv/io.h index 48829f7272..075369b1a3 100644 --- a/include/asm-frv/io.h +++ b/include/asm-frv/io.h @@ -18,6 +18,7 @@ #ifdef __KERNEL__ #include +#include #include #include #include @@ -104,6 +105,8 @@ static inline void __insl(unsigned long addr, void *buf, int len, int swap) __insl_sw(addr, buf, len); } +#define mmiowb() mb() + /* * make the short names macros so specific devices * can override them as required @@ -209,6 +212,10 @@ static inline uint32_t readl(const volatile void __iomem *addr) return ret; } +#define readb_relaxed readb +#define readw_relaxed readw +#define readl_relaxed readl + static inline void writeb(uint8_t datum, volatile void __iomem *addr) { __builtin_write8((volatile uint8_t __force *) addr, datum); @@ -268,11 +275,106 @@ static inline void __iomem *ioremap_fullcache(unsigned long physaddr, unsigned l extern void iounmap(void __iomem *addr); +static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) +{ + return (void __iomem *) port; +} + +static inline void ioport_unmap(void __iomem *p) +{ +} + static inline void flush_write_buffers(void) { __asm__ __volatile__ ("membar" : : :"memory"); } +/* + * do appropriate I/O accesses for token type + */ +static inline unsigned int ioread8(void __iomem *p) +{ + return __builtin_read8(p); +} + +static inline unsigned int ioread16(void __iomem *p) +{ + uint16_t ret = __builtin_read16(p); + if (__is_PCI_addr(p)) + ret = _swapw(ret); + return ret; +} + +static inline unsigned int ioread32(void __iomem *p) +{ + uint32_t ret = __builtin_read32(p); + if (__is_PCI_addr(p)) + ret = _swapl(ret); + return ret; +} + +static inline void iowrite8(u8 val, void __iomem *p) +{ + __builtin_write8(p, val); + if (__is_PCI_MEM(p)) + __flush_PCI_writes(); +} + +static inline void iowrite16(u16 val, void __iomem *p) +{ + if (__is_PCI_addr(p)) + val = _swapw(val); + __builtin_write16(p, val); + if (__is_PCI_MEM(p)) + __flush_PCI_writes(); +} + +static inline void iowrite32(u32 val, void __iomem *p) +{ + if (__is_PCI_addr(p)) + val = _swapl(val); + __builtin_write32(p, val); + if 
(__is_PCI_MEM(p)) + __flush_PCI_writes(); +} + +static inline void ioread8_rep(void __iomem *p, void *dst, unsigned long count) +{ + io_insb((unsigned long) p, dst, count); +} + +static inline void ioread16_rep(void __iomem *p, void *dst, unsigned long count) +{ + io_insw((unsigned long) p, dst, count); +} + +static inline void ioread32_rep(void __iomem *p, void *dst, unsigned long count) +{ + __insl_ns((unsigned long) p, dst, count); +} + +static inline void iowrite8_rep(void __iomem *p, const void *src, unsigned long count) +{ + io_outsb((unsigned long) p, src, count); +} + +static inline void iowrite16_rep(void __iomem *p, const void *src, unsigned long count) +{ + io_outsw((unsigned long) p, src, count); +} + +static inline void iowrite32_rep(void __iomem *p, const void *src, unsigned long count) +{ + __outsl_ns((unsigned long) p, src, count); +} + +/* Create a virtual mapping cookie for a PCI BAR (memory or IO) */ +struct pci_dev; +extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max); +static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p) +{ +} + /* * Convert a physical pointer to a virtual kernel pointer for /dev/mem @@ -285,6 +387,27 @@ static inline void flush_write_buffers(void) */ #define xlate_dev_kmem_ptr(p) p +/* + * Check BIOS signature + */ +static inline int check_signature(volatile void __iomem *io_addr, + const unsigned char *signature, int length) +{ + int retval = 0; + + do { + if (readb(io_addr) != *signature) + goto out; + io_addr++; + signature++; + length--; + } while (length); + + retval = 1; +out: + return retval; +} + #endif /* __KERNEL__ */ #endif /* _ASM_IO_H */ diff --git a/include/asm-frv/mb-regs.h b/include/asm-frv/mb-regs.h index c8f575fc42..93fa732fb0 100644 --- a/include/asm-frv/mb-regs.h +++ b/include/asm-frv/mb-regs.h @@ -68,6 +68,9 @@ do { \ #define __is_PCI_MEM(addr) \ ((unsigned long)(addr) - __region_PCI_MEM < 0x08000000UL) +#define __is_PCI_addr(addr) \ + ((unsigned long)(addr) - __region_PCI_IO < 0x0c000000UL) + #define __get_CLKSW() ({ *(volatile unsigned long *)(__region_CS2 + 0x0130000cUL) & 0xffUL; }) #define __get_CLKIN() (__get_CLKSW() * 125U * 100000U / 24U) @@ -149,6 +152,7 @@ do { \ #define __is_PCI_IO(addr) 0 /* no PCI */ #define __is_PCI_MEM(addr) 0 +#define __is_PCI_addr(addr) 0 #define __region_PCI_IO 0 #define __region_PCI_MEM 0 #define __flush_PCI_writes() do { } while(0) -- cgit v1.2.2 From 00d76710c253341b1e84795923e59ccdb099628f Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:21 -0800 Subject: [PATCH] frv: add module support stubs Add stubs for FRV module support. 
Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/kernel/Makefile | 1 + arch/frv/kernel/module.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ include/asm-frv/module.h | 16 +++++++--- 3 files changed, 93 insertions(+), 4 deletions(-) create mode 100644 arch/frv/kernel/module.c diff --git a/arch/frv/kernel/Makefile b/arch/frv/kernel/Makefile index 422f30ede5..5a827b349b 100644 --- a/arch/frv/kernel/Makefile +++ b/arch/frv/kernel/Makefile @@ -21,3 +21,4 @@ obj-$(CONFIG_PM) += pm.o cmode.o obj-$(CONFIG_MB93093_PDK) += pm-mb93093.o obj-$(CONFIG_SYSCTL) += sysctl.o obj-$(CONFIG_FUTEX) += futex.o +obj-$(CONFIG_MODULES) += module.o diff --git a/arch/frv/kernel/module.c b/arch/frv/kernel/module.c new file mode 100644 index 0000000000..850d168f69 --- /dev/null +++ b/arch/frv/kernel/module.c @@ -0,0 +1,80 @@ +/* module.c: FRV specific module loading bits + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * - Derived from arch/i386/kernel/module.c, Copyright (C) 2001 Rusty Russell. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt...) +#endif + +void *module_alloc(unsigned long size) +{ + if (size == 0) + return NULL; + + return vmalloc_exec(size); +} + + +/* Free memory returned from module_alloc */ +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ +} + +/* We don't need anything special. */ +int module_frob_arch_sections(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + char *secstrings, + struct module *mod) +{ + return 0; +} + +int apply_relocate(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", me->name); + return -ENOEXEC; +} + +int apply_relocate_add(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", me->name); + return -ENOEXEC; +} + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + return 0; +} + +void module_arch_cleanup(struct module *mod) +{ +} diff --git a/include/asm-frv/module.h b/include/asm-frv/module.h index 3223cfaef7..3d5c636028 100644 --- a/include/asm-frv/module.h +++ b/include/asm-frv/module.h @@ -11,10 +11,18 @@ #ifndef _ASM_MODULE_H #define _ASM_MODULE_H -#define module_map(x) vmalloc(x) -#define module_unmap(x) vfree(x) -#define module_arch_init(x) (0) -#define arch_init_modules(x) do { } while (0) +struct mod_arch_specific +{ +}; + +#define Elf_Shdr Elf32_Shdr +#define Elf_Sym Elf32_Sym +#define Elf_Ehdr Elf32_Ehdr + +/* + * Include the architecture version. + */ +#define MODULE_ARCH_VERMAGIC __stringify(PROCESSOR_MODEL_NAME) " " #endif /* _ASM_MODULE_H */ -- cgit v1.2.2 From a2a88f88782e8815cafa22f2b9e22234b9a61d0a Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:22 -0800 Subject: [PATCH] frv: add pci_iomap Implement pci_iomap() for FRV. 
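A hedged usage sketch, not part of the patch (the device, BAR number and register offset are made up), showing how a driver would typically consume this pci_iomap() together with the ioread/iowrite helpers added earlier in the series:

#include <linux/pci.h>

static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	void __iomem *regs;
	int err;

	err = pci_enable_device(pdev);
	if (err)
		return err;

	regs = pci_iomap(pdev, 0, 0);		/* map BAR 0, no length cap */
	if (!regs) {
		pci_disable_device(pdev);
		return -ENOMEM;
	}

	iowrite32(0x1, regs + 0x10);		/* hypothetical register write */
	pci_iounmap(pdev, regs);
	pci_disable_device(pdev);
	return 0;
}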
Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/mb93090-mb00/Makefile | 2 +- arch/frv/mb93090-mb00/pci-iomap.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 arch/frv/mb93090-mb00/pci-iomap.c diff --git a/arch/frv/mb93090-mb00/Makefile b/arch/frv/mb93090-mb00/Makefile index 3faf0f8cf9..76595e8707 100644 --- a/arch/frv/mb93090-mb00/Makefile +++ b/arch/frv/mb93090-mb00/Makefile @@ -3,7 +3,7 @@ # ifeq "$(CONFIG_PCI)" "y" -obj-y := pci-frv.o pci-irq.o pci-vdk.o +obj-y := pci-frv.o pci-irq.o pci-vdk.o pci-iomap.o ifeq "$(CONFIG_MMU)" "y" obj-y += pci-dma.o diff --git a/arch/frv/mb93090-mb00/pci-iomap.c b/arch/frv/mb93090-mb00/pci-iomap.c new file mode 100644 index 0000000000..068fa04bd5 --- /dev/null +++ b/arch/frv/mb93090-mb00/pci-iomap.c @@ -0,0 +1,29 @@ +/* pci-iomap.c: description + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include + +void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) +{ + unsigned long start = pci_resource_start(dev, bar); + unsigned long len = pci_resource_len(dev, bar); + unsigned long flags = pci_resource_flags(dev, bar); + + if (!len || !start) + return NULL; + + if ((flags & IORESOURCE_IO) || (flags & IORESOURCE_MEM)) + return (void __iomem *) start; + + return NULL; +} + +EXPORT_SYMBOL(pci_iomap); -- cgit v1.2.2 From 7a758313905f8b085b4591e8838626274e88b969 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:22 -0800 Subject: [PATCH] frv: fix PCMCIA configuration Fix PCMCIA configuration for FRV by including the stock PCMCIA configuration description file. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/Kconfig | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index ec85c0d6c6..3516a3aee8 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -305,23 +305,7 @@ config RESERVE_DMA_COHERENT source "drivers/pci/Kconfig" -config PCMCIA - tristate "Use PCMCIA" - help - Say Y here if you want to attach PCMCIA- or PC-cards to your FR-V - board. These are credit-card size devices such as network cards, - modems or hard drives often used with laptops computers. There are - actually two varieties of these cards: the older 16 bit PCMCIA cards - and the newer 32 bit CardBus cards. If you want to use CardBus - cards, you need to say Y here and also to "CardBus support" below. - - To use your PC-cards, you will need supporting software from David - Hinds pcmcia-cs package (see the file - for location). Please also read the PCMCIA-HOWTO, available from - . - - To compile this driver as modules, choose M here: the - modules will be called pcmcia_core and ds. +source "drivers/pcmcia/Kconfig" #config MATH_EMULATION # bool "Math emulation support (EXPERIMENTAL)" -- cgit v1.2.2 From f8aec7573b87d2bc09cafab459476953353d2efa Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:23 -0800 Subject: [PATCH] frv: force serial driver inclusion Force the 8230 serial driver to be built in if the on-CPU UARTs are to be used. 
It can't be used as a module because the arch setup needs to call into it. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/Kconfig | 5 +++++ arch/frv/kernel/setup.c | 2 ++ 2 files changed, 7 insertions(+) diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index 3516a3aee8..61261b78ce 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -274,6 +274,11 @@ config GPREL_DATA_NONE endchoice +config FRV_ONCPU_SERIAL + bool "Use on-CPU serial ports" + select SERIAL_8250 + default y + config PCI bool "Use PCI" depends on MB93090_MB00 diff --git a/arch/frv/kernel/setup.c b/arch/frv/kernel/setup.c index 767ebb55bd..5908deae96 100644 --- a/arch/frv/kernel/setup.c +++ b/arch/frv/kernel/setup.c @@ -787,6 +787,7 @@ void __init setup_arch(char **cmdline_p) #endif /* register those serial ports that are available */ +#ifdef CONFIG_FRV_ONCPU_SERIAL #ifndef CONFIG_GDBSTUB_UART0 __reg(UART0_BASE + UART_IER * 8) = 0; early_serial_setup(&__frv_uart0); @@ -795,6 +796,7 @@ void __init setup_arch(char **cmdline_p) __reg(UART1_BASE + UART_IER * 8) = 0; early_serial_setup(&__frv_uart1); #endif +#endif #if defined(CONFIG_CHR_DEV_FLASH) || defined(CONFIG_BLK_DEV_FLASH) /* we need to initialize the Flashrom device here since we might -- cgit v1.2.2 From 5c15d41bab185431a9a28982b5aaac251dde4556 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:24 -0800 Subject: [PATCH] frv: make get_user macro cast pointers Make the get_user macro cast the source pointer to an appropriate type for the specified size. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-frv/uaccess.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/asm-frv/uaccess.h b/include/asm-frv/uaccess.h index 991b50fbba..b6bcbe01f6 100644 --- a/include/asm-frv/uaccess.h +++ b/include/asm-frv/uaccess.h @@ -180,16 +180,16 @@ do { \ \ switch (sizeof(*(ptr))) { \ case 1: \ - __get_user_asm(__gu_err, __gu_val, ptr, "ub", "=r"); \ + __get_user_asm(__gu_err, *(u8*)&__gu_val, ptr, "ub", "=r"); \ break; \ case 2: \ - __get_user_asm(__gu_err, __gu_val, ptr, "uh", "=r"); \ + __get_user_asm(__gu_err, *(u16*)&__gu_val, ptr, "uh", "=r"); \ break; \ case 4: \ - __get_user_asm(__gu_err, __gu_val, ptr, "", "=r"); \ + __get_user_asm(__gu_err, *(u32*)&__gu_val, ptr, "", "=r"); \ break; \ case 8: \ - __get_user_asm(__gu_err, __gu_val, ptr, "d", "=e"); \ + __get_user_asm(__gu_err, *(u64*)&__gu_val, ptr, "d", "=e"); \ break; \ default: \ __gu_err = __get_user_bad(); \ -- cgit v1.2.2 From 41be6aef38c08f1f85ac1c4bd8191b0d1ec61b4c Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:25 -0800 Subject: [PATCH] frv: miscellaneous changes Fix a number of miscellanous items: (1) Declare lock sections in the linker script. (2) Recurse in the correct manner in the arch makefile. (3) asm/bug.h requires asm/linkage.h to be included first. One C file puts asm/bug.h first. (4) Add an empty RTC header file to avoid missing header file errors. (5) sg_dma_address() should use the dma_address member of a scatter list. (6) Add trivial pci_unmap support. (7) Add pgprot_noncached() (8) Discard u_quad_t. (9) Use ~0UL rather than ULONG_MAX in unistd.h in case the latter isn't declared. (10) Add an empty VGA header file to avoid missing header file errors. (11) Add an XOR header file to use the generic XOR stuff. 
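Item (7) above, pgprot_noncached(), in a hedged sketch that is not part of the patch (the mmap handler and the mapped range are hypothetical); it is the usual way a driver marks a user-space mapping of device memory uncacheable:

#include <linux/fs.h>
#include <linux/mm.h>

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       vma->vm_end - vma->vm_start,
			       vma->vm_page_prot);
}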
Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/Makefile | 6 +++--- arch/frv/kernel/vmlinux.lds.S | 1 + include/asm-frv/bug.h | 1 + include/asm-frv/dma-mapping.h | 2 +- include/asm-frv/mc146818rtc.h | 16 ++++++++++++++++ include/asm-frv/pci.h | 8 ++++++++ include/asm-frv/pgtable.h | 5 +++++ include/asm-frv/types.h | 1 - include/asm-frv/unistd.h | 2 +- include/asm-frv/vga.h | 17 +++++++++++++++++ include/asm-frv/xor.h | 1 + 11 files changed, 54 insertions(+), 6 deletions(-) create mode 100644 include/asm-frv/mc146818rtc.h create mode 100644 include/asm-frv/vga.h create mode 100644 include/asm-frv/xor.h diff --git a/arch/frv/Makefile b/arch/frv/Makefile index 54046d2386..90c0fb8d9d 100644 --- a/arch/frv/Makefile +++ b/arch/frv/Makefile @@ -109,10 +109,10 @@ bootstrap: $(Q)$(MAKEBOOT) bootstrap archmrproper: - $(Q)$(MAKE) -C arch/frv/boot mrproper + $(Q)$(MAKE) $(build)=arch/frv/boot mrproper archclean: - $(Q)$(MAKE) -C arch/frv/boot clean + $(Q)$(MAKE) $(build)=arch/frv/boot clean archdep: scripts/mkdep symlinks - $(Q)$(MAKE) -C arch/frv/boot dep + $(Q)$(MAKE) $(build)=arch/frv/boot dep diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index fceafd2cc2..f474534ba7 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -112,6 +112,7 @@ SECTIONS #endif ) SCHED_TEXT + LOCK_TEXT *(.fixup) *(.gnu.warning) *(.exitcall.exit) diff --git a/include/asm-frv/bug.h b/include/asm-frv/bug.h index 074c0d5770..451712cc30 100644 --- a/include/asm-frv/bug.h +++ b/include/asm-frv/bug.h @@ -12,6 +12,7 @@ #define _ASM_BUG_H #include +#include #ifdef CONFIG_BUG /* diff --git a/include/asm-frv/dma-mapping.h b/include/asm-frv/dma-mapping.h index 5003e017fd..e9fc1d4779 100644 --- a/include/asm-frv/dma-mapping.h +++ b/include/asm-frv/dma-mapping.h @@ -23,7 +23,7 @@ void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t * returns, or alternatively stop on the first sg_dma_len(sg) which * is 0. */ -#define sg_dma_address(sg) ((unsigned long) (page_to_phys((sg)->page) + (sg)->offset)) +#define sg_dma_address(sg) ((sg)->dma_address) #define sg_dma_len(sg) ((sg)->length) /* diff --git a/include/asm-frv/mc146818rtc.h b/include/asm-frv/mc146818rtc.h new file mode 100644 index 0000000000..90dfb7a633 --- /dev/null +++ b/include/asm-frv/mc146818rtc.h @@ -0,0 +1,16 @@ +/* mc146818rtc.h: RTC defs + * + * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ASM_MC146818RTC_H +#define _ASM_MC146818RTC_H + + +#endif /* _ASM_MC146818RTC_H */ diff --git a/include/asm-frv/pci.h b/include/asm-frv/pci.h index 1168451c27..598b0c6b69 100644 --- a/include/asm-frv/pci.h +++ b/include/asm-frv/pci.h @@ -57,6 +57,14 @@ extern void pci_free_consistent(struct pci_dev *hwdev, size_t size, */ #define PCI_DMA_BUS_IS_PHYS (1) +/* pci_unmap_{page,single} is a nop so... 
*/ +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) +#define pci_unmap_addr(PTR, ADDR_NAME) (0) +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) +#define pci_unmap_len(PTR, LEN_NAME) (0) +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) + #ifdef CONFIG_PCI static inline void pci_dma_burst_advice(struct pci_dev *pdev, enum pci_dma_burst_strategy *strat, diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h index 844666377d..d1c3b182c6 100644 --- a/include/asm-frv/pgtable.h +++ b/include/asm-frv/pgtable.h @@ -420,6 +420,11 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, asm volatile("dcf %M0" :: "U"(*ptep)); } +/* + * Macro to mark a page protection value as "uncacheable" + */ +#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_NOCACHE)) + /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. diff --git a/include/asm-frv/types.h b/include/asm-frv/types.h index 50605df6d8..2560f596a7 100644 --- a/include/asm-frv/types.h +++ b/include/asm-frv/types.h @@ -59,7 +59,6 @@ typedef unsigned int u32; typedef signed long long s64; typedef unsigned long long u64; -typedef u64 u_quad_t; /* Dma addresses are 32-bits wide. */ diff --git a/include/asm-frv/unistd.h b/include/asm-frv/unistd.h index 5cf989b448..cde376a7a8 100644 --- a/include/asm-frv/unistd.h +++ b/include/asm-frv/unistd.h @@ -313,7 +313,7 @@ do { \ unsigned long __sr2 = (res); \ if (__builtin_expect(__sr2 >= (unsigned long)(-4095), 0)) { \ errno = (-__sr2); \ - __sr2 = ULONG_MAX; \ + __sr2 = ~0UL; \ } \ return (type) __sr2; \ } while (0) diff --git a/include/asm-frv/vga.h b/include/asm-frv/vga.h new file mode 100644 index 0000000000..a702c800a2 --- /dev/null +++ b/include/asm-frv/vga.h @@ -0,0 +1,17 @@ +/* vga.h: VGA register stuff + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ASM_VGA_H +#define _ASM_VGA_H + + + +#endif /* _ASM_VGA_H */ diff --git a/include/asm-frv/xor.h b/include/asm-frv/xor.h new file mode 100644 index 0000000000..c82eb12a5b --- /dev/null +++ b/include/asm-frv/xor.h @@ -0,0 +1 @@ +#include -- cgit v1.2.2 From 2087ff3ec56eba9bccd3b3a9d4d42670b1543f5d Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:25 -0800 Subject: [PATCH] frv: fix uninitialised variable in atm nicstar driver Fix an uninitialised variable warning in the atm nicstar driver. 
Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/atm/nicstar.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c index c57e20dcb0..074abc81ec 100644 --- a/drivers/atm/nicstar.c +++ b/drivers/atm/nicstar.c @@ -2126,8 +2126,7 @@ static void process_rsq(ns_dev *card) if (!ns_rsqe_valid(card->rsq.next)) return; - while (ns_rsqe_valid(card->rsq.next)) - { + do { dequeue_rx(card, card->rsq.next); ns_rsqe_init(card->rsq.next); previous = card->rsq.next; @@ -2135,7 +2134,7 @@ static void process_rsq(ns_dev *card) card->rsq.next = card->rsq.base; else card->rsq.next++; - } + } while (ns_rsqe_valid(card->rsq.next)); writel((((u32) previous) - ((u32) card->rsq.base)), card->membase + RSQH); } -- cgit v1.2.2 From 6d524aed1f50b2b1d5b4ad5a4e2fe3f38106d0a6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:26 -0800 Subject: [PATCH] frv: fix uninitialised variable in serverworks driver Fix an uninitialised variable warning in the serverworks driver. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/ide/pci/serverworks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ide/pci/serverworks.c b/drivers/ide/pci/serverworks.c index ff2e217a8c..0d3073f4ea 100644 --- a/drivers/ide/pci/serverworks.c +++ b/drivers/ide/pci/serverworks.c @@ -69,7 +69,7 @@ static int check_in_drive_lists (ide_drive_t *drive, const char **list) static u8 svwks_ratemask (ide_drive_t *drive) { struct pci_dev *dev = HWIF(drive)->pci_dev; - u8 mode; + u8 mode = 0; if (!svwks_revision) pci_read_config_byte(dev, PCI_REVISION_ID, &svwks_revision); -- cgit v1.2.2 From 22fc6eccbf4ce4eb6265e6ada7b50a7b9cc57d05 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Sun, 8 Jan 2006 01:01:27 -0800 Subject: [PATCH] Change maxaligned_in_smp alignemnt macros to internodealigned_in_smp macros ____cacheline_maxaligned_in_smp is currently used to align critical structures and avoid false sharing. It uses per-arch L1_CACHE_SHIFT_MAX and people find L1_CACHE_SHIFT_MAX useless. However, we have been using ____cacheline_maxaligned_in_smp to align structures on the internode cacheline size. As per Andi's suggestion, following patch kills ____cacheline_maxaligned_in_smp and introduces INTERNODE_CACHE_SHIFT, which defaults to L1_CACHE_SHIFT for all arches. Arches needing L3/Internode cacheline alignment can define INTERNODE_CACHE_SHIFT in the arch asm/cache.h. Patch replaces ____cacheline_maxaligned_in_smp with ____cacheline_internodealigned_in_smp With this patch, L1_CACHE_SHIFT_MAX can be killed Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/init_task.c | 2 +- arch/i386/kernel/irq.c | 2 +- arch/x86_64/kernel/init_task.c | 2 +- include/linux/cache.h | 17 +++++++++++++---- include/linux/ide.h | 2 +- include/linux/mmzone.h | 4 ++-- include/linux/rcupdate.h | 2 +- kernel/rcupdate.c | 4 ++-- mm/sparse.c | 4 ++-- 9 files changed, 24 insertions(+), 15 deletions(-) diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c index 9caa8e8db8..cff95d10a4 100644 --- a/arch/i386/kernel/init_task.c +++ b/arch/i386/kernel/init_task.c @@ -42,5 +42,5 @@ EXPORT_SYMBOL(init_task); * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. 
*/ -DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp = INIT_TSS; +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 1a201a9328..f3a9c78c4a 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -19,7 +19,7 @@ #include #include -DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp; +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); #ifndef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c index e0ba5c1043..ce31d904d6 100644 --- a/arch/x86_64/kernel/init_task.c +++ b/arch/x86_64/kernel/init_task.c @@ -44,6 +44,6 @@ EXPORT_SYMBOL(init_task); * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp = INIT_TSS; +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; #define ALIGN_TO_4K __attribute__((section(".data.init_task"))) diff --git a/include/linux/cache.h b/include/linux/cache.h index 0b7ecf3af7..ffe52210fc 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -45,12 +45,21 @@ #endif /* CONFIG_SMP */ #endif -#if !defined(____cacheline_maxaligned_in_smp) +/* + * The maximum alignment needed for some critical structures + * These could be inter-node cacheline sizes/L3 cacheline + * size etc. Define this in asm/cache.h for your arch + */ +#ifndef INTERNODE_CACHE_SHIFT +#define INTERNODE_CACHE_SHIFT L1_CACHE_SHIFT +#endif + +#if !defined(____cacheline_internodealigned_in_smp) #if defined(CONFIG_SMP) -#define ____cacheline_maxaligned_in_smp \ - __attribute__((__aligned__(1 << (L1_CACHE_SHIFT_MAX)))) +#define ____cacheline_internodealigned_in_smp \ + __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) #else -#define ____cacheline_maxaligned_in_smp +#define ____cacheline_internodealigned_in_smp #endif #endif diff --git a/include/linux/ide.h b/include/linux/ide.h index 7b6a6a58e4..4dd6694963 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -801,7 +801,7 @@ typedef struct hwif_s { unsigned dma; void (*led_act)(void *data, int rw); -} ____cacheline_maxaligned_in_smp ide_hwif_t; +} ____cacheline_internodealigned_in_smp ide_hwif_t; /* * internal ide interrupt handler type diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2a89c132ba..7e4ae6ab19 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -38,7 +38,7 @@ struct pglist_data; #if defined(CONFIG_SMP) struct zone_padding { char x[0]; -} ____cacheline_maxaligned_in_smp; +} ____cacheline_internodealigned_in_smp; #define ZONE_PADDING(name) struct zone_padding name; #else #define ZONE_PADDING(name) @@ -233,7 +233,7 @@ struct zone { * rarely used fields: */ char *name; -} ____cacheline_maxaligned_in_smp; +} ____cacheline_internodealigned_in_smp; /* diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a471f3bb71..51747cd88d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -65,7 +65,7 @@ struct rcu_ctrlblk { long cur; /* Current batch number. */ long completed; /* Number of the last completed batch */ int next_pending; /* Is the next batch already waiting? */ -} ____cacheline_maxaligned_in_smp; +} ____cacheline_internodealigned_in_smp; /* Is batch a before batch b ? 
*/ static inline int rcu_batch_before(long a, long b) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 48d3bce465..c9afc61240 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -61,9 +61,9 @@ struct rcu_state { /* for current batch to proceed. */ }; -static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = +static struct rcu_state rcu_state ____cacheline_internodealigned_in_smp = {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; -static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp = +static struct rcu_state rcu_bh_state ____cacheline_internodealigned_in_smp = {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; diff --git a/mm/sparse.c b/mm/sparse.c index 72079b538e..0a51f36ba3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -18,10 +18,10 @@ */ #ifdef CONFIG_SPARSEMEM_EXTREME struct mem_section *mem_section[NR_SECTION_ROOTS] - ____cacheline_maxaligned_in_smp; + ____cacheline_internodealigned_in_smp; #else struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] - ____cacheline_maxaligned_in_smp; + ____cacheline_internodealigned_in_smp; #endif EXPORT_SYMBOL(mem_section); -- cgit v1.2.2 From 1fd73c6b6737b7e6eacac1b00dac16e7540c3cb1 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Sun, 8 Jan 2006 01:01:28 -0800 Subject: [PATCH] Kill L1_CACHE_SHIFT_MAX Kill L1_CACHE_SHIFT from all arches. Since L1_CACHE_SHIFT_MAX is not used anymore with the introduction of INTERNODE_CACHE, kill L1_CACHE_SHIFT_MAX. Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-alpha/cache.h | 1 - include/asm-arm/cache.h | 5 ----- include/asm-cris/arch-v10/cache.h | 1 - include/asm-cris/arch-v32/cache.h | 1 - include/asm-cris/dma-mapping.h | 2 +- include/asm-generic/dma-mapping.h | 2 +- include/asm-i386/cache.h | 2 -- include/asm-i386/dma-mapping.h | 2 +- include/asm-ia64/cache.h | 2 -- include/asm-m32r/cache.h | 2 -- include/asm-m68k/cache.h | 2 -- include/asm-mips/cache.h | 1 - include/asm-parisc/cache.h | 1 - include/asm-powerpc/cache.h | 1 - include/asm-powerpc/dma-mapping.h | 2 +- include/asm-s390/cache.h | 1 - include/asm-sh/cache.h | 2 -- include/asm-sh64/cache.h | 2 -- include/asm-sparc/cache.h | 1 - include/asm-sparc64/cache.h | 1 - include/asm-um/cache.h | 3 --- include/asm-v850/cache.h | 2 -- include/asm-x86_64/cache.h | 1 - 23 files changed, 4 insertions(+), 36 deletions(-) diff --git a/include/asm-alpha/cache.h b/include/asm-alpha/cache.h index e69b29501a..e6d4d1695e 100644 --- a/include/asm-alpha/cache.h +++ b/include/asm-alpha/cache.h @@ -20,6 +20,5 @@ #define L1_CACHE_ALIGN(x) (((x)+(L1_CACHE_BYTES-1))&~(L1_CACHE_BYTES-1)) #define SMP_CACHE_BYTES L1_CACHE_BYTES -#define L1_CACHE_SHIFT_MAX L1_CACHE_SHIFT #endif diff --git a/include/asm-arm/cache.h b/include/asm-arm/cache.h index 8d161f7c87..31332c8ac0 100644 --- a/include/asm-arm/cache.h +++ b/include/asm-arm/cache.h @@ -7,9 +7,4 @@ #define L1_CACHE_SHIFT 5 #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) -/* - * largest L1 which this arch supports - */ -#define L1_CACHE_SHIFT_MAX 5 - #endif diff --git a/include/asm-cris/arch-v10/cache.h b/include/asm-cris/arch-v10/cache.h index 1d1d1ba65b..aea27184d2 100644 --- a/include/asm-cris/arch-v10/cache.h +++ b/include/asm-cris/arch-v10/cache.h @@ -4,6 +4,5 @@ /* Etrax 100LX have 32-byte cache-lines. 
*/ #define L1_CACHE_BYTES 32 #define L1_CACHE_SHIFT 5 -#define L1_CACHE_SHIFT_MAX 5 #endif /* _ASM_ARCH_CACHE_H */ diff --git a/include/asm-cris/arch-v32/cache.h b/include/asm-cris/arch-v32/cache.h index 4fed8d62cc..80b236b153 100644 --- a/include/asm-cris/arch-v32/cache.h +++ b/include/asm-cris/arch-v32/cache.h @@ -4,6 +4,5 @@ /* A cache-line is 32 bytes. */ #define L1_CACHE_BYTES 32 #define L1_CACHE_SHIFT 5 -#define L1_CACHE_SHIFT_MAX 5 #endif /* _ASM_CRIS_ARCH_CACHE_H */ diff --git a/include/asm-cris/dma-mapping.h b/include/asm-cris/dma-mapping.h index 8eff51349a..cbf1a98f01 100644 --- a/include/asm-cris/dma-mapping.h +++ b/include/asm-cris/dma-mapping.h @@ -153,7 +153,7 @@ dma_set_mask(struct device *dev, u64 mask) static inline int dma_get_cache_alignment(void) { - return (1 << L1_CACHE_SHIFT_MAX); + return (1 << INTERNODE_CACHE_SHIFT); } #define dma_is_consistent(d) (1) diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h index 747d790295..1b35620771 100644 --- a/include/asm-generic/dma-mapping.h +++ b/include/asm-generic/dma-mapping.h @@ -274,7 +274,7 @@ dma_get_cache_alignment(void) { /* no easy way to get cache size on all processors, so return * the maximum possible, to be safe */ - return (1 << L1_CACHE_SHIFT_MAX); + return (1 << INTERNODE_CACHE_SHIFT); } static inline void diff --git a/include/asm-i386/cache.h b/include/asm-i386/cache.h index 849788710f..615911e5bd 100644 --- a/include/asm-i386/cache.h +++ b/include/asm-i386/cache.h @@ -10,6 +10,4 @@ #define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT) #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) -#define L1_CACHE_SHIFT_MAX 7 /* largest L1 which this arch supports */ - #endif diff --git a/include/asm-i386/dma-mapping.h b/include/asm-i386/dma-mapping.h index e56c335f8e..6c37a9ab8d 100644 --- a/include/asm-i386/dma-mapping.h +++ b/include/asm-i386/dma-mapping.h @@ -150,7 +150,7 @@ dma_get_cache_alignment(void) { /* no easy way to get cache size on all x86, so return the * maximum possible, to be safe */ - return (1 << L1_CACHE_SHIFT_MAX); + return (1 << INTERNODE_CACHE_SHIFT); } #define dma_is_consistent(d) (1) diff --git a/include/asm-ia64/cache.h b/include/asm-ia64/cache.h index 666d8f175c..40dd25195d 100644 --- a/include/asm-ia64/cache.h +++ b/include/asm-ia64/cache.h @@ -12,8 +12,6 @@ #define L1_CACHE_SHIFT CONFIG_IA64_L1_CACHE_SHIFT #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) -#define L1_CACHE_SHIFT_MAX 7 /* largest L1 which this arch supports */ - #ifdef CONFIG_SMP # define SMP_CACHE_SHIFT L1_CACHE_SHIFT # define SMP_CACHE_BYTES L1_CACHE_BYTES diff --git a/include/asm-m32r/cache.h b/include/asm-m32r/cache.h index 7248205969..9c2b2d9998 100644 --- a/include/asm-m32r/cache.h +++ b/include/asm-m32r/cache.h @@ -7,6 +7,4 @@ #define L1_CACHE_SHIFT 4 #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) -#define L1_CACHE_SHIFT_MAX 4 - #endif /* _ASM_M32R_CACHE_H */ diff --git a/include/asm-m68k/cache.h b/include/asm-m68k/cache.h index 6161fd3d86..fed3fd30de 100644 --- a/include/asm-m68k/cache.h +++ b/include/asm-m68k/cache.h @@ -8,6 +8,4 @@ #define L1_CACHE_SHIFT 4 #define L1_CACHE_BYTES (1<< L1_CACHE_SHIFT) -#define L1_CACHE_SHIFT_MAX 4 /* largest L1 which this arch supports */ - #endif diff --git a/include/asm-mips/cache.h b/include/asm-mips/cache.h index 1a5d1a669d..55e19f2ff0 100644 --- a/include/asm-mips/cache.h +++ b/include/asm-mips/cache.h @@ -15,7 +15,6 @@ #define L1_CACHE_SHIFT CONFIG_MIPS_L1_CACHE_SHIFT #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) -#define L1_CACHE_SHIFT_MAX 6 #define SMP_CACHE_SHIFT 
L1_CACHE_SHIFT #define SMP_CACHE_BYTES L1_CACHE_BYTES diff --git a/include/asm-parisc/cache.h b/include/asm-parisc/cache.h index 5da72e38bd..38d201b565 100644 --- a/include/asm-parisc/cache.h +++ b/include/asm-parisc/cache.h @@ -28,7 +28,6 @@ #define L1_CACHE_ALIGN(x) (((x)+(L1_CACHE_BYTES-1))&~(L1_CACHE_BYTES-1)) #define SMP_CACHE_BYTES L1_CACHE_BYTES -#define L1_CACHE_SHIFT_MAX 5 /* largest L1 which this arch supports */ extern void flush_data_cache_local(void); /* flushes local data-cache only */ extern void flush_instruction_cache_local(void); /* flushes local code-cache only */ diff --git a/include/asm-powerpc/cache.h b/include/asm-powerpc/cache.h index 26ce502e76..6379c2df5c 100644 --- a/include/asm-powerpc/cache.h +++ b/include/asm-powerpc/cache.h @@ -19,7 +19,6 @@ #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) #define SMP_CACHE_BYTES L1_CACHE_BYTES -#define L1_CACHE_SHIFT_MAX 7 /* largest L1 which this arch supports */ #if defined(__powerpc64__) && !defined(__ASSEMBLY__) struct ppc64_caches { diff --git a/include/asm-powerpc/dma-mapping.h b/include/asm-powerpc/dma-mapping.h index 59a80163f7..a96e5742ca 100644 --- a/include/asm-powerpc/dma-mapping.h +++ b/include/asm-powerpc/dma-mapping.h @@ -229,7 +229,7 @@ static inline int dma_get_cache_alignment(void) #ifdef CONFIG_PPC64 /* no easy way to get cache size on all processors, so return * the maximum possible, to be safe */ - return (1 << L1_CACHE_SHIFT_MAX); + return (1 << INTERNODE_CACHE_SHIFT); #else /* * Each processor family will define its own L1_CACHE_SHIFT, diff --git a/include/asm-s390/cache.h b/include/asm-s390/cache.h index 29845378b2..e20cdd9074 100644 --- a/include/asm-s390/cache.h +++ b/include/asm-s390/cache.h @@ -13,7 +13,6 @@ #define L1_CACHE_BYTES 256 #define L1_CACHE_SHIFT 8 -#define L1_CACHE_SHIFT_MAX 8 /* largest L1 which this arch supports */ #define ARCH_KMALLOC_MINALIGN 8 diff --git a/include/asm-sh/cache.h b/include/asm-sh/cache.h index 9b4dd6d821..656fdfe9e8 100644 --- a/include/asm-sh/cache.h +++ b/include/asm-sh/cache.h @@ -22,8 +22,6 @@ #define L1_CACHE_ALIGN(x) (((x)+(L1_CACHE_BYTES-1))&~(L1_CACHE_BYTES-1)) -#define L1_CACHE_SHIFT_MAX 5 /* largest L1 which this arch supports */ - struct cache_info { unsigned int ways; unsigned int sets; diff --git a/include/asm-sh64/cache.h b/include/asm-sh64/cache.h index f54e85e8a4..a4f36f0036 100644 --- a/include/asm-sh64/cache.h +++ b/include/asm-sh64/cache.h @@ -20,8 +20,6 @@ #define L1_CACHE_ALIGN_MASK (~(L1_CACHE_BYTES - 1)) #define L1_CACHE_ALIGN(x) (((x)+(L1_CACHE_BYTES - 1)) & L1_CACHE_ALIGN_MASK) #define L1_CACHE_SIZE_BYTES (L1_CACHE_BYTES << 10) -/* Largest L1 which this arch supports */ -#define L1_CACHE_SHIFT_MAX 5 #ifdef MODULE #define __cacheline_aligned __attribute__((__aligned__(L1_CACHE_BYTES))) diff --git a/include/asm-sparc/cache.h b/include/asm-sparc/cache.h index a10522cb21..cb971e88ae 100644 --- a/include/asm-sparc/cache.h +++ b/include/asm-sparc/cache.h @@ -13,7 +13,6 @@ #define L1_CACHE_SHIFT 5 #define L1_CACHE_BYTES 32 #define L1_CACHE_ALIGN(x) ((((x)+(L1_CACHE_BYTES-1))&~(L1_CACHE_BYTES-1))) -#define L1_CACHE_SHIFT_MAX 5 /* largest L1 which this arch supports */ #define SMP_CACHE_BYTES 32 diff --git a/include/asm-sparc64/cache.h b/include/asm-sparc64/cache.h index ade5ec3bfd..f7d35a2ae9 100644 --- a/include/asm-sparc64/cache.h +++ b/include/asm-sparc64/cache.h @@ -9,7 +9,6 @@ #define L1_CACHE_BYTES 32 /* Two 16-byte sub-blocks per line. 
*/ #define L1_CACHE_ALIGN(x) (((x)+(L1_CACHE_BYTES-1))&~(L1_CACHE_BYTES-1)) -#define L1_CACHE_SHIFT_MAX 5 /* largest L1 which this arch supports */ #define SMP_CACHE_BYTES_SHIFT 6 #define SMP_CACHE_BYTES (1 << SMP_CACHE_BYTES_SHIFT) /* L2 cache line size. */ diff --git a/include/asm-um/cache.h b/include/asm-um/cache.h index a10602a5b2..3d05870755 100644 --- a/include/asm-um/cache.h +++ b/include/asm-um/cache.h @@ -13,9 +13,6 @@ # define L1_CACHE_SHIFT 5 #endif -/* XXX: this is valid for x86 and x86_64. */ -#define L1_CACHE_SHIFT_MAX 7 /* largest L1 which this arch supports */ - #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) #endif diff --git a/include/asm-v850/cache.h b/include/asm-v850/cache.h index cbf9096e85..8832c7ea32 100644 --- a/include/asm-v850/cache.h +++ b/include/asm-v850/cache.h @@ -23,6 +23,4 @@ #define L1_CACHE_SHIFT 4 #endif -#define L1_CACHE_SHIFT_MAX L1_CACHE_SHIFT - #endif /* __V850_CACHE_H__ */ diff --git a/include/asm-x86_64/cache.h b/include/asm-x86_64/cache.h index 33e5342412..b4a2401de7 100644 --- a/include/asm-x86_64/cache.h +++ b/include/asm-x86_64/cache.h @@ -9,6 +9,5 @@ /* L1 cache line size */ #define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT) #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) -#define L1_CACHE_SHIFT_MAX 7 /* largest L1 which this arch supports */ #endif -- cgit v1.2.2 From b2de464f7f0006bec162559a8db161869d32ee93 Mon Sep 17 00:00:00 2001 From: Woody Suwalski Date: Sun, 8 Jan 2006 01:01:29 -0800 Subject: [PATCH] ARM: Netwinder ds1620 driver needs an export to be built as module ds1620 module is using gpio_read symbol, so works only if "built-in" symbol needs to be exported from the kernel image Signed-off-by: Woody Suwalski Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-footbridge/netwinder-hw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-footbridge/netwinder-hw.c b/arch/arm/mach-footbridge/netwinder-hw.c index 775f85fc85..9e563de465 100644 --- a/arch/arm/mach-footbridge/netwinder-hw.c +++ b/arch/arm/mach-footbridge/netwinder-hw.c @@ -601,6 +601,7 @@ EXPORT_SYMBOL(gpio_lock); EXPORT_SYMBOL(gpio_modify_op); EXPORT_SYMBOL(gpio_modify_io); EXPORT_SYMBOL(cpld_modify); +EXPORT_SYMBOL(gpio_read); /* * Initialise any other hardware after we've got the PCI bus -- cgit v1.2.2 From 0805d89c151b4800eade4c2f50d39c5253d7d054 Mon Sep 17 00:00:00 2001 From: Gennady Sharapov Date: Sun, 8 Jan 2006 01:01:29 -0800 Subject: [PATCH] uml: move libc-dependent code from signal_user.c The serial UML OS-abstraction layer patch (um/kernel dir). 
This moves all systemcalls from signal_user.c file under os-Linux dir Signed-off-by: Gennady Sharapov Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/include/kern_util.h | 2 - arch/um/include/os.h | 10 +++ arch/um/include/signal_user.h | 28 ------- arch/um/kernel/Makefile | 2 +- arch/um/kernel/irq_user.c | 1 - arch/um/kernel/process_kern.c | 1 - arch/um/kernel/signal_kern.c | 1 - arch/um/kernel/signal_user.c | 157 ------------------------------------ arch/um/kernel/skas/include/skas.h | 1 - arch/um/kernel/skas/process.c | 11 --- arch/um/kernel/skas/process_kern.c | 1 - arch/um/kernel/skas/trap_user.c | 2 +- arch/um/kernel/time.c | 2 +- arch/um/kernel/trap_user.c | 1 - arch/um/kernel/tt/exec_kern.c | 1 - arch/um/kernel/tt/process_kern.c | 1 - arch/um/kernel/tt/tracer.c | 1 - arch/um/kernel/tt/trap_user.c | 2 +- arch/um/os-Linux/main.c | 1 - arch/um/os-Linux/process.c | 1 - arch/um/os-Linux/signal.c | 158 ++++++++++++++++++++++++++++++++++--- arch/um/os-Linux/start_up.c | 1 - arch/um/os-Linux/tt.c | 1 - arch/um/sys-i386/signal.c | 1 - 24 files changed, 162 insertions(+), 226 deletions(-) delete mode 100644 arch/um/include/signal_user.h delete mode 100644 arch/um/kernel/signal_user.c diff --git a/arch/um/include/kern_util.h b/arch/um/include/kern_util.h index e5fec55701..58c0b10fcf 100644 --- a/arch/um/include/kern_util.h +++ b/arch/um/include/kern_util.h @@ -51,8 +51,6 @@ extern void timer_handler(int sig, union uml_pt_regs *regs); extern int set_signals(int enable); extern void force_sigbus(void); extern int pid_to_processor_id(int pid); -extern void block_signals(void); -extern void unblock_signals(void); extern void deliver_signals(void *t); extern int next_syscall_index(int max); extern int next_trap_index(int max); diff --git a/arch/um/include/os.h b/arch/um/include/os.h index c279ee6d89..cfc806e461 100644 --- a/arch/um/include/os.h +++ b/arch/um/include/os.h @@ -219,4 +219,14 @@ extern int umid_file_name(char *name, char *buf, int len); extern int set_umid(char *name); extern char *get_umid(void); +/* signal.c */ +extern void set_sigstack(void *sig_stack, int size); +extern void remove_sigstack(void); +extern void set_handler(int sig, void (*handler)(int), int flags, ...); +extern int change_sig(int signal, int on); +extern void block_signals(void); +extern void unblock_signals(void); +extern int get_signals(void); +extern int set_signals(int enable); + #endif diff --git a/arch/um/include/signal_user.h b/arch/um/include/signal_user.h deleted file mode 100644 index b075e543d8..0000000000 --- a/arch/um/include/signal_user.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef __SIGNAL_USER_H__ -#define __SIGNAL_USER_H__ - -extern int signal_stack_size; - -extern int change_sig(int signal, int on); -extern void set_sigstack(void *stack, int size); -extern void set_handler(int sig, void (*handler)(int), int flags, ...); -extern int set_signals(int enable); -extern int get_signals(void); - -#endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. 
- * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 6f7700593a..9ce6c57747 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -9,7 +9,7 @@ clean-files := obj-y = config.o exec_kern.o exitcode.o \ init_task.o irq.o irq_user.o ksyms.o mem.o physmem.o \ process_kern.o ptrace.o reboot.o resource.o sigio_user.o sigio_kern.o \ - signal_kern.o signal_user.o smp.o syscall_kern.o sysrq.o time.o \ + signal_kern.o smp.o syscall_kern.o sysrq.o time.o \ time_kern.o tlb.o trap_kern.o trap_user.o uaccess.o um_arch.o umid.o \ user_util.o diff --git a/arch/um/kernel/irq_user.c b/arch/um/kernel/irq_user.c index 50a2aa35cd..0e32f5f4a8 100644 --- a/arch/um/kernel/irq_user.c +++ b/arch/um/kernel/irq_user.c @@ -15,7 +15,6 @@ #include "kern_util.h" #include "user.h" #include "process.h" -#include "signal_user.h" #include "sigio.h" #include "irq_user.h" #include "os.h" diff --git a/arch/um/kernel/process_kern.c b/arch/um/kernel/process_kern.c index 651abf255b..d2d3f25677 100644 --- a/arch/um/kernel/process_kern.c +++ b/arch/um/kernel/process_kern.c @@ -36,7 +36,6 @@ #include "kern_util.h" #include "kern.h" #include "signal_kern.h" -#include "signal_user.h" #include "init.h" #include "irq_user.h" #include "mem_user.h" diff --git a/arch/um/kernel/signal_kern.c b/arch/um/kernel/signal_kern.c index 03618bd13d..7b0e0e81c1 100644 --- a/arch/um/kernel/signal_kern.c +++ b/arch/um/kernel/signal_kern.c @@ -22,7 +22,6 @@ #include "asm/ucontext.h" #include "kern_util.h" #include "signal_kern.h" -#include "signal_user.h" #include "kern.h" #include "frame_kern.h" #include "sigcontext.h" diff --git a/arch/um/kernel/signal_user.c b/arch/um/kernel/signal_user.c deleted file mode 100644 index 62f457835f..0000000000 --- a/arch/um/kernel/signal_user.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "user_util.h" -#include "kern_util.h" -#include "user.h" -#include "signal_user.h" -#include "signal_kern.h" -#include "sysdep/sigcontext.h" -#include "sigcontext.h" - -void set_sigstack(void *sig_stack, int size) -{ - stack_t stack = ((stack_t) { .ss_flags = 0, - .ss_sp = (__ptr_t) sig_stack, - .ss_size = size - sizeof(void *) }); - - if(sigaltstack(&stack, NULL) != 0) - panic("enabling signal stack failed, errno = %d\n", errno); -} - -void set_handler(int sig, void (*handler)(int), int flags, ...) -{ - struct sigaction action; - va_list ap; - int mask; - - va_start(ap, flags); - action.sa_handler = handler; - sigemptyset(&action.sa_mask); - while((mask = va_arg(ap, int)) != -1){ - sigaddset(&action.sa_mask, mask); - } - va_end(ap); - action.sa_flags = flags; - action.sa_restorer = NULL; - if(sigaction(sig, &action, NULL) < 0) - panic("sigaction failed"); -} - -int change_sig(int signal, int on) -{ - sigset_t sigset, old; - - sigemptyset(&sigset); - sigaddset(&sigset, signal); - sigprocmask(on ? 
SIG_UNBLOCK : SIG_BLOCK, &sigset, &old); - return(!sigismember(&old, signal)); -} - -/* Both here and in set/get_signal we don't touch SIGPROF, because we must not - * disable profiling; it's safe because the profiling code does not interact - * with the kernel code at all.*/ - -static void change_signals(int type) -{ - sigset_t mask; - - sigemptyset(&mask); - sigaddset(&mask, SIGVTALRM); - sigaddset(&mask, SIGALRM); - sigaddset(&mask, SIGIO); - if(sigprocmask(type, &mask, NULL) < 0) - panic("Failed to change signal mask - errno = %d", errno); -} - -void block_signals(void) -{ - change_signals(SIG_BLOCK); -} - -void unblock_signals(void) -{ - change_signals(SIG_UNBLOCK); -} - -/* These are the asynchronous signals. SIGVTALRM and SIGARLM are handled - * together under SIGVTALRM_BIT. SIGPROF is excluded because we want to - * be able to profile all of UML, not just the non-critical sections. If - * profiling is not thread-safe, then that is not my problem. We can disable - * profiling when SMP is enabled in that case. - */ -#define SIGIO_BIT 0 -#define SIGVTALRM_BIT 1 - -static int enable_mask(sigset_t *mask) -{ - int sigs; - - sigs = sigismember(mask, SIGIO) ? 0 : 1 << SIGIO_BIT; - sigs |= sigismember(mask, SIGVTALRM) ? 0 : 1 << SIGVTALRM_BIT; - sigs |= sigismember(mask, SIGALRM) ? 0 : 1 << SIGVTALRM_BIT; - return(sigs); -} - -int get_signals(void) -{ - sigset_t mask; - - if(sigprocmask(SIG_SETMASK, NULL, &mask) < 0) - panic("Failed to get signal mask"); - return(enable_mask(&mask)); -} - -int set_signals(int enable) -{ - sigset_t mask; - int ret; - - sigemptyset(&mask); - if(enable & (1 << SIGIO_BIT)) - sigaddset(&mask, SIGIO); - if(enable & (1 << SIGVTALRM_BIT)){ - sigaddset(&mask, SIGVTALRM); - sigaddset(&mask, SIGALRM); - } - - /* This is safe - sigprocmask is guaranteed to copy locally the - * value of new_set, do his work and then, at the end, write to - * old_set. - */ - if(sigprocmask(SIG_UNBLOCK, &mask, &mask) < 0) - panic("Failed to enable signals"); - ret = enable_mask(&mask); - sigemptyset(&mask); - if((enable & (1 << SIGIO_BIT)) == 0) - sigaddset(&mask, SIGIO); - if((enable & (1 << SIGVTALRM_BIT)) == 0){ - sigaddset(&mask, SIGVTALRM); - sigaddset(&mask, SIGALRM); - } - if(sigprocmask(SIG_BLOCK, &mask, NULL) < 0) - panic("Failed to block signals"); - - return(ret); -} - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. 
- * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/kernel/skas/include/skas.h b/arch/um/kernel/skas/include/skas.h index daa2f85b68..01d489de39 100644 --- a/arch/um/kernel/skas/include/skas.h +++ b/arch/um/kernel/skas/include/skas.h @@ -22,7 +22,6 @@ extern int start_idle_thread(void *stack, void *switch_buf_ptr, extern int user_thread(unsigned long stack, int flags); extern void userspace(union uml_pt_regs *regs); extern void new_thread_proc(void *stack, void (*handler)(int sig)); -extern void remove_sigstack(void); extern void new_thread_handler(int sig); extern void handle_syscall(union uml_pt_regs *regs); extern int map(struct mm_id * mm_idp, unsigned long virt, diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index 599d679bd4..9264d4021d 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -31,7 +31,6 @@ #include "proc_mm.h" #include "skas_ptrace.h" #include "chan_user.h" -#include "signal_user.h" #include "registers.h" #include "mem.h" #include "uml-config.h" @@ -514,16 +513,6 @@ int start_idle_thread(void *stack, void *switch_buf_ptr, void **fork_buf_ptr) siglongjmp(**switch_buf, 1); } -void remove_sigstack(void) -{ - stack_t stack = ((stack_t) { .ss_flags = SS_DISABLE, - .ss_sp = NULL, - .ss_size = 0 }); - - if(sigaltstack(&stack, NULL) != 0) - panic("disabling signal stack failed, errno = %d\n", errno); -} - void initial_thread_cb_skas(void (*proc)(void *), void *arg) { sigjmp_buf here; diff --git a/arch/um/kernel/skas/process_kern.c b/arch/um/kernel/skas/process_kern.c index 9c99025396..09790ccb16 100644 --- a/arch/um/kernel/skas/process_kern.c +++ b/arch/um/kernel/skas/process_kern.c @@ -14,7 +14,6 @@ #include "asm/atomic.h" #include "kern_util.h" #include "time_user.h" -#include "signal_user.h" #include "skas.h" #include "os.h" #include "user_util.h" diff --git a/arch/um/kernel/skas/trap_user.c b/arch/um/kernel/skas/trap_user.c index 9950a6716f..28403d2caf 100644 --- a/arch/um/kernel/skas/trap_user.c +++ b/arch/um/kernel/skas/trap_user.c @@ -5,7 +5,6 @@ #include #include -#include "signal_user.h" #include "user_util.h" #include "kern_util.h" #include "task.h" @@ -14,6 +13,7 @@ #include "ptrace_user.h" #include "sysdep/ptrace.h" #include "sysdep/ptrace_user.h" +#include "os.h" void sig_handler_common_skas(int sig, void *sc_ptr) { diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index c40b611e3d..11f518a7e1 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -14,9 +14,9 @@ #include "kern_util.h" #include "user.h" #include "process.h" -#include "signal_user.h" #include "time_user.h" #include "kern_constants.h" +#include "os.h" /* XXX This really needs to be declared and initialized in a kernel file since * it's in diff --git a/arch/um/kernel/trap_user.c b/arch/um/kernel/trap_user.c index e9ccd6b8d3..e551107251 100644 --- a/arch/um/kernel/trap_user.c +++ b/arch/um/kernel/trap_user.c @@ -17,7 +17,6 @@ #include "sigcontext.h" #include "sysdep/sigcontext.h" #include "irq_user.h" -#include "signal_user.h" #include "time_user.h" #include "task.h" #include "mode.h" diff --git a/arch/um/kernel/tt/exec_kern.c b/arch/um/kernel/tt/exec_kern.c index 065b504a65..136e54c47d 100644 --- a/arch/um/kernel/tt/exec_kern.c +++ b/arch/um/kernel/tt/exec_kern.c @@ -14,7 +14,6 @@ #include "kern_util.h" #include "irq_user.h" #include "time_user.h" -#include "signal_user.h" #include "mem_user.h" #include "os.h" #include 
"tlb.h" diff --git a/arch/um/kernel/tt/process_kern.c b/arch/um/kernel/tt/process_kern.c index cfaa373a6e..14d4622a5f 100644 --- a/arch/um/kernel/tt/process_kern.c +++ b/arch/um/kernel/tt/process_kern.c @@ -13,7 +13,6 @@ #include "asm/ptrace.h" #include "asm/tlbflush.h" #include "irq_user.h" -#include "signal_user.h" #include "kern_util.h" #include "user_util.h" #include "os.h" diff --git a/arch/um/kernel/tt/tracer.c b/arch/um/kernel/tt/tracer.c index d11e7399d7..71daae24e4 100644 --- a/arch/um/kernel/tt/tracer.c +++ b/arch/um/kernel/tt/tracer.c @@ -19,7 +19,6 @@ #include "sigcontext.h" #include "sysdep/sigcontext.h" #include "os.h" -#include "signal_user.h" #include "user_util.h" #include "mem_user.h" #include "process.h" diff --git a/arch/um/kernel/tt/trap_user.c b/arch/um/kernel/tt/trap_user.c index fc108615be..7501900f89 100644 --- a/arch/um/kernel/tt/trap_user.c +++ b/arch/um/kernel/tt/trap_user.c @@ -8,11 +8,11 @@ #include #include "sysdep/ptrace.h" #include "sysdep/sigcontext.h" -#include "signal_user.h" #include "user_util.h" #include "kern_util.h" #include "task.h" #include "tt.h" +#include "os.h" void sig_handler_common_tt(int sig, void *sc_ptr) { diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c index 23da27d225..172c847445 100644 --- a/arch/um/os-Linux/main.c +++ b/arch/um/os-Linux/main.c @@ -16,7 +16,6 @@ #include "user_util.h" #include "kern_util.h" #include "mem_user.h" -#include "signal_user.h" #include "time_user.h" #include "irq_user.h" #include "user.h" diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index d9c52387c4..39815c6b5e 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -15,7 +15,6 @@ #include "os.h" #include "user.h" #include "user_util.h" -#include "signal_user.h" #include "process.h" #include "irq_user.h" #include "kern_util.h" diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index c7bfd5ee39..c1f46a0fef 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -4,9 +4,22 @@ */ #include +#include +#include +#include +#include +#include +#include +#include +#include "user_util.h" +#include "kern_util.h" +#include "user.h" +#include "signal_kern.h" +#include "sysdep/sigcontext.h" +#include "sysdep/signal.h" +#include "sigcontext.h" #include "time_user.h" #include "mode.h" -#include "sysdep/signal.h" void sig_handler(ARCH_SIGHDLR_PARAM) { @@ -36,13 +49,138 @@ void alarm_handler(ARCH_SIGHDLR_PARAM) switch_timers(1); } -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: +void set_sigstack(void *sig_stack, int size) +{ + stack_t stack = ((stack_t) { .ss_flags = 0, + .ss_sp = (__ptr_t) sig_stack, + .ss_size = size - sizeof(void *) }); + + if(sigaltstack(&stack, NULL) != 0) + panic("enabling signal stack failed, errno = %d\n", errno); +} + +void remove_sigstack(void) +{ + stack_t stack = ((stack_t) { .ss_flags = SS_DISABLE, + .ss_sp = NULL, + .ss_size = 0 }); + + if(sigaltstack(&stack, NULL) != 0) + panic("disabling signal stack failed, errno = %d\n", errno); +} + +void set_handler(int sig, void (*handler)(int), int flags, ...) 
+{ + struct sigaction action; + va_list ap; + int mask; + + va_start(ap, flags); + action.sa_handler = handler; + sigemptyset(&action.sa_mask); + while((mask = va_arg(ap, int)) != -1){ + sigaddset(&action.sa_mask, mask); + } + va_end(ap); + action.sa_flags = flags; + action.sa_restorer = NULL; + if(sigaction(sig, &action, NULL) < 0) + panic("sigaction failed"); +} + +int change_sig(int signal, int on) +{ + sigset_t sigset, old; + + sigemptyset(&sigset); + sigaddset(&sigset, signal); + sigprocmask(on ? SIG_UNBLOCK : SIG_BLOCK, &sigset, &old); + return(!sigismember(&old, signal)); +} + +/* Both here and in set/get_signal we don't touch SIGPROF, because we must not + * disable profiling; it's safe because the profiling code does not interact + * with the kernel code at all.*/ + +static void change_signals(int type) +{ + sigset_t mask; + + sigemptyset(&mask); + sigaddset(&mask, SIGVTALRM); + sigaddset(&mask, SIGALRM); + sigaddset(&mask, SIGIO); + if(sigprocmask(type, &mask, NULL) < 0) + panic("Failed to change signal mask - errno = %d", errno); +} + +void block_signals(void) +{ + change_signals(SIG_BLOCK); +} + +void unblock_signals(void) +{ + change_signals(SIG_UNBLOCK); +} + +/* These are the asynchronous signals. SIGVTALRM and SIGARLM are handled + * together under SIGVTALRM_BIT. SIGPROF is excluded because we want to + * be able to profile all of UML, not just the non-critical sections. If + * profiling is not thread-safe, then that is not my problem. We can disable + * profiling when SMP is enabled in that case. */ +#define SIGIO_BIT 0 +#define SIGVTALRM_BIT 1 + +static int enable_mask(sigset_t *mask) +{ + int sigs; + + sigs = sigismember(mask, SIGIO) ? 0 : 1 << SIGIO_BIT; + sigs |= sigismember(mask, SIGVTALRM) ? 0 : 1 << SIGVTALRM_BIT; + sigs |= sigismember(mask, SIGALRM) ? 0 : 1 << SIGVTALRM_BIT; + return(sigs); +} + +int get_signals(void) +{ + sigset_t mask; + + if(sigprocmask(SIG_SETMASK, NULL, &mask) < 0) + panic("Failed to get signal mask"); + return(enable_mask(&mask)); +} + +int set_signals(int enable) +{ + sigset_t mask; + int ret; + + sigemptyset(&mask); + if(enable & (1 << SIGIO_BIT)) + sigaddset(&mask, SIGIO); + if(enable & (1 << SIGVTALRM_BIT)){ + sigaddset(&mask, SIGVTALRM); + sigaddset(&mask, SIGALRM); + } + + /* This is safe - sigprocmask is guaranteed to copy locally the + * value of new_set, do his work and then, at the end, write to + * old_set. 
+ */ + if(sigprocmask(SIG_UNBLOCK, &mask, &mask) < 0) + panic("Failed to enable signals"); + ret = enable_mask(&mask); + sigemptyset(&mask); + if((enable & (1 << SIGIO_BIT)) == 0) + sigaddset(&mask, SIGIO); + if((enable & (1 << SIGVTALRM_BIT)) == 0){ + sigaddset(&mask, SIGVTALRM); + sigaddset(&mask, SIGALRM); + } + if(sigprocmask(SIG_BLOCK, &mask, NULL) < 0) + panic("Failed to block signals"); + + return(ret); +} diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index 29a9e3f437..b47e5e71d1 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -24,7 +24,6 @@ #include "kern_util.h" #include "user.h" #include "signal_kern.h" -#include "signal_user.h" #include "sysdep/ptrace.h" #include "sysdep/sigcontext.h" #include "irq_user.h" diff --git a/arch/um/os-Linux/tt.c b/arch/um/os-Linux/tt.c index a6db887793..37828e5b35 100644 --- a/arch/um/os-Linux/tt.c +++ b/arch/um/os-Linux/tt.c @@ -23,7 +23,6 @@ #include "kern_util.h" #include "user.h" #include "signal_kern.h" -#include "signal_user.h" #include "sysdep/ptrace.h" #include "sysdep/sigcontext.h" #include "irq_user.h" diff --git a/arch/um/sys-i386/signal.c b/arch/um/sys-i386/signal.c index 16bc19928b..7cd1a82dc8 100644 --- a/arch/um/sys-i386/signal.c +++ b/arch/um/sys-i386/signal.c @@ -10,7 +10,6 @@ #include "asm/uaccess.h" #include "asm/unistd.h" #include "frame_kern.h" -#include "signal_user.h" #include "sigcontext.h" #include "registers.h" #include "mode.h" -- cgit v1.2.2 From ea2ba7dc3dd3f85034e6da6abacc813d723a2ad5 Mon Sep 17 00:00:00 2001 From: Gennady Sharapov Date: Sun, 8 Jan 2006 01:01:31 -0800 Subject: [PATCH] uml: move libc-dependent code from trap_user.c The serial UML OS-abstraction layer patch (um/kernel dir). This moves all systemcalls from trap_user.c file under os-Linux dir Signed-off-by: Gennady Sharapov Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/include/kern_util.h | 17 ++++++++- arch/um/include/os.h | 6 ++++ arch/um/include/user_util.h | 10 ++---- arch/um/kernel/skas/Makefile | 2 +- arch/um/kernel/skas/trap_user.c | 78 ----------------------------------------- arch/um/kernel/trap_kern.c | 11 +++++- arch/um/kernel/trap_user.c | 65 +++++++--------------------------- arch/um/kernel/tt/trap_user.c | 14 +++++--- arch/um/kernel/um_arch.c | 7 +++- arch/um/os-Linux/Makefile | 8 +++-- arch/um/os-Linux/skas/Makefile | 10 ++++++ arch/um/os-Linux/skas/trap.c | 73 ++++++++++++++++++++++++++++++++++++++ arch/um/os-Linux/trap.c | 17 +++++++++ arch/um/os-Linux/tt.c | 14 ++++++++ 14 files changed, 182 insertions(+), 150 deletions(-) delete mode 100644 arch/um/kernel/skas/trap_user.c create mode 100644 arch/um/os-Linux/skas/Makefile create mode 100644 arch/um/os-Linux/skas/trap.c create mode 100644 arch/um/os-Linux/trap.c diff --git a/arch/um/include/kern_util.h b/arch/um/include/kern_util.h index 58c0b10fcf..8f4e46d677 100644 --- a/arch/um/include/kern_util.h +++ b/arch/um/include/kern_util.h @@ -1,4 +1,4 @@ -/* +/* * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) * Licensed under the GPL */ @@ -10,6 +10,19 @@ #include "sysdep/ptrace.h" #include "sysdep/faultinfo.h" +typedef void (*kern_hndl)(int, union uml_pt_regs *); + +struct kern_handlers { + kern_hndl relay_signal; + kern_hndl winch; + kern_hndl bus_handler; + kern_hndl page_fault; + kern_hndl sigio_handler; + kern_hndl timer_handler; +}; + +extern struct kern_handlers handlinfo_kern; + extern int ncpus; extern char *linux_prog; extern char 
*gdb_init; @@ -109,6 +122,8 @@ extern void arch_switch(void); extern void free_irq(unsigned int, void *); extern int um_in_interrupt(void); extern int cpu(void); +extern void segv_handler(int sig, union uml_pt_regs *regs); +extern void sigio_handler(int sig, union uml_pt_regs *regs); #endif diff --git a/arch/um/include/os.h b/arch/um/include/os.h index cfc806e461..dd72d66cf0 100644 --- a/arch/um/include/os.h +++ b/arch/um/include/os.h @@ -9,6 +9,8 @@ #include "uml-config.h" #include "asm/types.h" #include "../os/include/file.h" +#include "sysdep/ptrace.h" +#include "kern_util.h" #define OS_TYPE_FILE 1 #define OS_TYPE_DIR 2 @@ -229,4 +231,8 @@ extern void unblock_signals(void); extern int get_signals(void); extern int set_signals(int enable); +/* trap.c */ +extern void os_fill_handlinfo(struct kern_handlers h); +extern void do_longjmp(void *p, int val); + #endif diff --git a/arch/um/include/user_util.h b/arch/um/include/user_util.h index b9984003e6..c1dbd77b07 100644 --- a/arch/um/include/user_util.h +++ b/arch/um/include/user_util.h @@ -1,4 +1,4 @@ -/* +/* * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) * Licensed under the GPL */ @@ -23,12 +23,7 @@ struct cpu_task { extern struct cpu_task cpu_tasks[]; -struct signal_info { - void (*handler)(int, union uml_pt_regs *); - int is_irq; -}; - -extern struct signal_info sig_info[]; +extern void (*sig_info[])(int, union uml_pt_regs *); extern unsigned long low_physmem; extern unsigned long high_physmem; @@ -64,7 +59,6 @@ extern void setup_machinename(char *machine_out); extern void setup_hostinfo(void); extern void do_exec(int old_pid, int new_pid); extern void tracer_panic(char *msg, ...); -extern void do_longjmp(void *p, int val); extern int detach(int pid, int sig); extern int attach(int pid); extern void kill_child_dead(int pid); diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index 8de471b59c..7a9fc16d71 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -4,7 +4,7 @@ # obj-y := clone.o exec_kern.o mem.o mem_user.o mmu.o process.o process_kern.o \ - syscall.o tlb.o trap_user.o uaccess.o + syscall.o tlb.o uaccess.o USER_OBJS := process.o clone.o diff --git a/arch/um/kernel/skas/trap_user.c b/arch/um/kernel/skas/trap_user.c deleted file mode 100644 index 28403d2caf..0000000000 --- a/arch/um/kernel/skas/trap_user.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (C) 2002 - 2003 Jeff Dike (jdike@addtoit.com) - * Licensed under the GPL - */ - -#include -#include -#include "user_util.h" -#include "kern_util.h" -#include "task.h" -#include "sigcontext.h" -#include "skas.h" -#include "ptrace_user.h" -#include "sysdep/ptrace.h" -#include "sysdep/ptrace_user.h" -#include "os.h" - -void sig_handler_common_skas(int sig, void *sc_ptr) -{ - struct sigcontext *sc = sc_ptr; - struct skas_regs *r; - struct signal_info *info; - int save_errno = errno; - int save_user; - - /* This is done because to allow SIGSEGV to be delivered inside a SEGV - * handler. This can happen in copy_user, and if SEGV is disabled, - * the process will die. 
- * XXX Figure out why this is better than SA_NODEFER - */ - if(sig == SIGSEGV) - change_sig(SIGSEGV, 1); - - r = &TASK_REGS(get_current())->skas; - save_user = r->is_user; - r->is_user = 0; - if ( sig == SIGFPE || sig == SIGSEGV || - sig == SIGBUS || sig == SIGILL || - sig == SIGTRAP ) { - GET_FAULTINFO_FROM_SC(r->faultinfo, sc); - } - - change_sig(SIGUSR1, 1); - info = &sig_info[sig]; - if(!info->is_irq) unblock_signals(); - - (*info->handler)(sig, (union uml_pt_regs *) r); - - errno = save_errno; - r->is_user = save_user; -} - -extern int ptrace_faultinfo; - -void user_signal(int sig, union uml_pt_regs *regs, int pid) -{ - struct signal_info *info; - int segv = ((sig == SIGFPE) || (sig == SIGSEGV) || (sig == SIGBUS) || - (sig == SIGILL) || (sig == SIGTRAP)); - - if (segv) - get_skas_faultinfo(pid, ®s->skas.faultinfo); - info = &sig_info[sig]; - (*info->handler)(sig, regs); - - unblock_signals(); -} - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/kernel/trap_kern.c b/arch/um/kernel/trap_kern.c index 0d4c10a736..b79f805bdc 100644 --- a/arch/um/kernel/trap_kern.c +++ b/arch/um/kernel/trap_kern.c @@ -1,4 +1,4 @@ -/* +/* * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) * Licensed under the GPL */ @@ -29,6 +29,7 @@ #ifdef CONFIG_MODE_SKAS #include "skas.h" #endif +#include "os.h" /* Note this is constrained to return 0, -EFAULT, -EACCESS, -ENOMEM by segv(). */ int handle_page_fault(unsigned long address, unsigned long ip, @@ -125,6 +126,14 @@ out_of_memory: goto out; } +struct kern_handlers handlinfo_kern = { + .relay_signal = relay_signal, + .winch = winch, + .bus_handler = relay_signal, + .page_fault = segv_handler, + .sigio_handler = sigio_handler, + .timer_handler = timer_handler +}; /* * We give a *copy* of the faultinfo in the regs to segv. 
* This must be done, since nesting SEGVs could overwrite diff --git a/arch/um/kernel/trap_user.c b/arch/um/kernel/trap_user.c index e551107251..5590b5739d 100644 --- a/arch/um/kernel/trap_user.c +++ b/arch/um/kernel/trap_user.c @@ -1,4 +1,4 @@ -/* +/* * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) * Licensed under the GPL */ @@ -25,20 +25,6 @@ #include "user_util.h" #include "os.h" -void kill_child_dead(int pid) -{ - kill(pid, SIGKILL); - kill(pid, SIGCONT); - do { - int n; - CATCH_EINTR(n = waitpid(pid, NULL, 0)); - if (n > 0) - kill(pid, SIGCONT); - else - break; - } while(1); -} - void segv_handler(int sig, union uml_pt_regs *regs) { struct faultinfo * fi = UPT_FAULTINFO(regs); @@ -55,43 +41,18 @@ void usr2_handler(int sig, union uml_pt_regs *regs) CHOOSE_MODE(syscall_handler_tt(sig, regs), (void) 0); } -struct signal_info sig_info[] = { - [ SIGTRAP ] { .handler = relay_signal, - .is_irq = 0 }, - [ SIGFPE ] { .handler = relay_signal, - .is_irq = 0 }, - [ SIGILL ] { .handler = relay_signal, - .is_irq = 0 }, - [ SIGWINCH ] { .handler = winch, - .is_irq = 1 }, - [ SIGBUS ] { .handler = bus_handler, - .is_irq = 0 }, - [ SIGSEGV] { .handler = segv_handler, - .is_irq = 0 }, - [ SIGIO ] { .handler = sigio_handler, - .is_irq = 1 }, - [ SIGVTALRM ] { .handler = timer_handler, - .is_irq = 1 }, - [ SIGALRM ] { .handler = timer_handler, - .is_irq = 1 }, - [ SIGUSR2 ] { .handler = usr2_handler, - .is_irq = 0 }, -}; +void (*sig_info[NSIG])(int, union uml_pt_regs *); -void do_longjmp(void *b, int val) +void os_fill_handlinfo(struct kern_handlers h) { - sigjmp_buf *buf = b; - - siglongjmp(*buf, val); + sig_info[SIGTRAP] = h.relay_signal; + sig_info[SIGFPE] = h.relay_signal; + sig_info[SIGILL] = h.relay_signal; + sig_info[SIGWINCH] = h.winch; + sig_info[SIGBUS] = h.bus_handler; + sig_info[SIGSEGV] = h.page_fault; + sig_info[SIGIO] = h.sigio_handler; + sig_info[SIGVTALRM] = h.timer_handler; + sig_info[SIGALRM] = h.timer_handler; + sig_info[SIGUSR2] = usr2_handler; } - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/kernel/tt/trap_user.c b/arch/um/kernel/tt/trap_user.c index 7501900f89..a414c529fb 100644 --- a/arch/um/kernel/tt/trap_user.c +++ b/arch/um/kernel/tt/trap_user.c @@ -1,4 +1,4 @@ -/* +/* * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) * Licensed under the GPL */ @@ -18,8 +18,8 @@ void sig_handler_common_tt(int sig, void *sc_ptr) { struct sigcontext *sc = sc_ptr; struct tt_regs save_regs, *r; - struct signal_info *info; int save_errno = errno, is_user; + void (*handler)(int, union uml_pt_regs *); /* This is done because to allow SIGSEGV to be delivered inside a SEGV * handler. 
This can happen in copy_user, and if SEGV is disabled, @@ -40,10 +40,14 @@ void sig_handler_common_tt(int sig, void *sc_ptr) if(sig != SIGUSR2) r->syscall = -1; - info = &sig_info[sig]; - if(!info->is_irq) unblock_signals(); + handler = sig_info[sig]; + + /* unblock SIGALRM, SIGVTALRM, SIGIO if sig isn't IRQ signal */ + if (sig != SIGIO && sig != SIGWINCH && + sig != SIGVTALRM && sig != SIGALRM) + unblock_signals(); - (*info->handler)(sig, (union uml_pt_regs *) r); + handler(sig, (union uml_pt_regs *) r); if(is_user){ interrupt_end(); diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 26626b2b91..73747ac197 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -1,4 +1,4 @@ -/* +/* * Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com) * Licensed under the GPL */ @@ -363,6 +363,11 @@ int linux_main(int argc, char **argv) uml_start = CHOOSE_MODE_PROC(set_task_sizes_tt, set_task_sizes_skas, 0, &host_task_size, &task_size); + /* + * Setting up handlers to 'sig_info' struct + */ + os_fill_handlinfo(handlinfo_kern); + brk_start = (unsigned long) sbrk(0); CHOOSE_MODE_PROC(before_mem_tt, before_mem_skas, brk_start); /* Increase physical memory size for exec-shield users diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 11e30b13e3..40c7d6b1df 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -4,11 +4,13 @@ # obj-y = aio.o elf_aux.o file.o helper.o main.o mem.o process.o signal.o \ - start_up.o time.o tt.o tty.o uaccess.o umid.o user_syms.o drivers/ \ - sys-$(SUBARCH)/ + start_up.o time.o trap.o tt.o tty.o uaccess.o umid.o user_syms.o \ + drivers/ sys-$(SUBARCH)/ + +obj-$(CONFIG_MODE_SKAS) += skas/ USER_OBJS := aio.o elf_aux.o file.o helper.o main.o mem.o process.o signal.o \ - start_up.o time.o tt.o tty.o uaccess.o umid.o + start_up.o time.o trap.o tt.o tty.o uaccess.o umid.o elf_aux.o: $(ARCH_DIR)/kernel-offsets.h CFLAGS_elf_aux.o += -I$(objtree)/arch/um diff --git a/arch/um/os-Linux/skas/Makefile b/arch/um/os-Linux/skas/Makefile new file mode 100644 index 0000000000..eab5386d60 --- /dev/null +++ b/arch/um/os-Linux/skas/Makefile @@ -0,0 +1,10 @@ +# +# Copyright (C) 2002 - 2004 Jeff Dike (jdike@addtoit.com) +# Licensed under the GPL +# + +obj-y := trap.o + +USER_OBJS := trap.o + +include arch/um/scripts/Makefile.rules diff --git a/arch/um/os-Linux/skas/trap.c b/arch/um/os-Linux/skas/trap.c new file mode 100644 index 0000000000..818f30e900 --- /dev/null +++ b/arch/um/os-Linux/skas/trap.c @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2002 - 2003 Jeff Dike (jdike@addtoit.com) + * Licensed under the GPL + */ + +#include +#include +#include "user_util.h" +#include "kern_util.h" +#include "task.h" +#include "sigcontext.h" +#include "skas.h" +#include "ptrace_user.h" +#include "sysdep/ptrace.h" +#include "sysdep/ptrace_user.h" +#include "os.h" + +void sig_handler_common_skas(int sig, void *sc_ptr) +{ + struct sigcontext *sc = sc_ptr; + struct skas_regs *r; + void (*handler)(int, union uml_pt_regs *); + int save_errno = errno; + int save_user; + + /* This is done because to allow SIGSEGV to be delivered inside a SEGV + * handler. This can happen in copy_user, and if SEGV is disabled, + * the process will die. 
+ * XXX Figure out why this is better than SA_NODEFER + */ + if(sig == SIGSEGV) + change_sig(SIGSEGV, 1); + + r = &TASK_REGS(get_current())->skas; + save_user = r->is_user; + r->is_user = 0; + if ( sig == SIGFPE || sig == SIGSEGV || + sig == SIGBUS || sig == SIGILL || + sig == SIGTRAP ) { + GET_FAULTINFO_FROM_SC(r->faultinfo, sc); + } + + change_sig(SIGUSR1, 1); + + handler = sig_info[sig]; + + /* unblock SIGALRM, SIGVTALRM, SIGIO if sig isn't IRQ signal */ + if (sig != SIGIO && sig != SIGWINCH && + sig != SIGVTALRM && sig != SIGALRM) + unblock_signals(); + + handler(sig, (union uml_pt_regs *) r); + + errno = save_errno; + r->is_user = save_user; +} + +extern int ptrace_faultinfo; + +void user_signal(int sig, union uml_pt_regs *regs, int pid) +{ + void (*handler)(int, union uml_pt_regs *); + int segv = ((sig == SIGFPE) || (sig == SIGSEGV) || (sig == SIGBUS) || + (sig == SIGILL) || (sig == SIGTRAP)); + + if (segv) + get_skas_faultinfo(pid, ®s->skas.faultinfo); + + handler = sig_info[sig]; + handler(sig, (union uml_pt_regs *) regs); + + unblock_signals(); +} diff --git a/arch/um/os-Linux/trap.c b/arch/um/os-Linux/trap.c new file mode 100644 index 0000000000..6e7841c23d --- /dev/null +++ b/arch/um/os-Linux/trap.c @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include +#include +#include "kern_util.h" +#include "user_util.h" +#include "os.h" + +void do_longjmp(void *b, int val) +{ + sigjmp_buf *buf = b; + + siglongjmp(*buf, val); +} diff --git a/arch/um/os-Linux/tt.c b/arch/um/os-Linux/tt.c index 37828e5b35..cb2648b79d 100644 --- a/arch/um/os-Linux/tt.c +++ b/arch/um/os-Linux/tt.c @@ -49,6 +49,20 @@ int protect_memory(unsigned long addr, unsigned long len, int r, int w, int x, return(0); } +void kill_child_dead(int pid) +{ + kill(pid, SIGKILL); + kill(pid, SIGCONT); + do { + int n; + CATCH_EINTR(n = waitpid(pid, NULL, 0)); + if (n > 0) + kill(pid, SIGCONT); + else + break; + } while(1); +} + /* *------------------------- * only for tt mode (will be deleted in future...) -- cgit v1.2.2 From c66fdd5e324392584c6f11de65cfe24b0e2d9303 Mon Sep 17 00:00:00 2001 From: Gennady Sharapov Date: Sun, 8 Jan 2006 01:01:32 -0800 Subject: [PATCH] uml: merge trap_user.c and trap_kern.c The serial UML OS-abstraction layer patch (um/kernel dir). This joins trap_user.c and trap_kernel.c files. 
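These UML patches converge on one dispatch-table idiom: the old struct-based sig_info[] table becomes a flat array of handler pointers indexed by signal number, filled once at start-up by os_fill_handlinfo() and consulted by the common low-level signal handlers. The following is a minimal, self-contained sketch of that idiom, not code from these patches: the handler names (relay_example, timer_example) and the table size are illustrative assumptions, and the uml_pt_regs plumbing is omitted.

/*
 * Sketch of a signal dispatch table in the style used above.
 * Names and the table size are assumptions for illustration only.
 */
#include <signal.h>
#include <stdio.h>

#define SIG_TABLE_SIZE 64		/* assumed upper bound on signal numbers used here */

static void (*sig_table[SIG_TABLE_SIZE])(int);

static void relay_example(int sig)	/* stand-in for a relay_signal()-style handler */
{
	printf("relaying signal %d\n", sig);
}

static void timer_example(int sig)	/* stand-in for a timer_handler()-style handler */
{
	printf("timer tick, signal %d\n", sig);
}

static void fill_table(void)		/* analogous role to os_fill_handlinfo() */
{
	sig_table[SIGTRAP]   = relay_example;
	sig_table[SIGILL]    = relay_example;
	sig_table[SIGVTALRM] = timer_example;
	sig_table[SIGALRM]   = timer_example;
}

static void common_handler(int sig)	/* analogous role to sig_handler_common_*() */
{
	void (*handler)(int) = sig_table[sig];

	if (handler)
		handler(sig);		/* dispatch through the table */
}

int main(void)
{
	fill_table();
	common_handler(SIGALRM);	/* dispatches to timer_example() */
	return 0;
}

Compared with the old struct signal_info { handler, is_irq } entries, the flat array pushes the "is this an IRQ signal?" decision into the callers, which is what the explicit sig != SIGIO && sig != SIGWINCH && sig != SIGVTALRM && sig != SIGALRM checks in the patched handlers above do before calling unblock_signals().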
Signed-off-by: Gennady Sharapov Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/kernel/Makefile | 2 +- arch/um/kernel/trap_kern.c | 14 +++++++++++ arch/um/kernel/trap_user.c | 58 ---------------------------------------------- arch/um/os-Linux/trap.c | 25 +++++++++++++++++++- 4 files changed, 39 insertions(+), 60 deletions(-) delete mode 100644 arch/um/kernel/trap_user.c diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 9ce6c57747..193cc2b744 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = config.o exec_kern.o exitcode.o \ init_task.o irq.o irq_user.o ksyms.o mem.o physmem.o \ process_kern.o ptrace.o reboot.o resource.o sigio_user.o sigio_kern.o \ signal_kern.o smp.o syscall_kern.o sysrq.o time.o \ - time_kern.o tlb.o trap_kern.o trap_user.o uaccess.o um_arch.o umid.o \ + time_kern.o tlb.o trap_kern.o uaccess.o um_arch.o umid.o \ user_util.o obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o diff --git a/arch/um/kernel/trap_kern.c b/arch/um/kernel/trap_kern.c index b79f805bdc..d56046c2ab 100644 --- a/arch/um/kernel/trap_kern.c +++ b/arch/um/kernel/trap_kern.c @@ -26,6 +26,9 @@ #include "mconsole_kern.h" #include "mem.h" #include "mem_kern.h" +#include "sysdep/sigcontext.h" +#include "sysdep/ptrace.h" +#include "os.h" #ifdef CONFIG_MODE_SKAS #include "skas.h" #endif @@ -126,6 +129,17 @@ out_of_memory: goto out; } +void segv_handler(int sig, union uml_pt_regs *regs) +{ + struct faultinfo * fi = UPT_FAULTINFO(regs); + + if(UPT_IS_USER(regs) && !SEGV_IS_FIXABLE(fi)){ + bad_segv(*fi, UPT_IP(regs)); + return; + } + segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs); +} + struct kern_handlers handlinfo_kern = { .relay_signal = relay_signal, .winch = winch, diff --git a/arch/um/kernel/trap_user.c b/arch/um/kernel/trap_user.c deleted file mode 100644 index 5590b5739d..0000000000 --- a/arch/um/kernel/trap_user.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "init.h" -#include "sysdep/ptrace.h" -#include "sigcontext.h" -#include "sysdep/sigcontext.h" -#include "irq_user.h" -#include "time_user.h" -#include "task.h" -#include "mode.h" -#include "choose-mode.h" -#include "kern_util.h" -#include "user_util.h" -#include "os.h" - -void segv_handler(int sig, union uml_pt_regs *regs) -{ - struct faultinfo * fi = UPT_FAULTINFO(regs); - - if(UPT_IS_USER(regs) && !SEGV_IS_FIXABLE(fi)){ - bad_segv(*fi, UPT_IP(regs)); - return; - } - segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs); -} - -void usr2_handler(int sig, union uml_pt_regs *regs) -{ - CHOOSE_MODE(syscall_handler_tt(sig, regs), (void) 0); -} - -void (*sig_info[NSIG])(int, union uml_pt_regs *); - -void os_fill_handlinfo(struct kern_handlers h) -{ - sig_info[SIGTRAP] = h.relay_signal; - sig_info[SIGFPE] = h.relay_signal; - sig_info[SIGILL] = h.relay_signal; - sig_info[SIGWINCH] = h.winch; - sig_info[SIGBUS] = h.bus_handler; - sig_info[SIGSEGV] = h.page_fault; - sig_info[SIGIO] = h.sigio_handler; - sig_info[SIGVTALRM] = h.timer_handler; - sig_info[SIGALRM] = h.timer_handler; - sig_info[SIGUSR2] = usr2_handler; -} diff --git a/arch/um/os-Linux/trap.c b/arch/um/os-Linux/trap.c index 6e7841c23d..321e1c8e22 100644 --- a/arch/um/os-Linux/trap.c +++ b/arch/um/os-Linux/trap.c @@ -3,11 +3,34 @@ * Licensed under the GPL */ -#include +#include 
#include +#include #include "kern_util.h" #include "user_util.h" #include "os.h" +#include "mode.h" + +void usr2_handler(int sig, union uml_pt_regs *regs) +{ + CHOOSE_MODE(syscall_handler_tt(sig, regs), (void) 0); +} + +void (*sig_info[NSIG])(int, union uml_pt_regs *); + +void os_fill_handlinfo(struct kern_handlers h) +{ + sig_info[SIGTRAP] = h.relay_signal; + sig_info[SIGFPE] = h.relay_signal; + sig_info[SIGILL] = h.relay_signal; + sig_info[SIGWINCH] = h.winch; + sig_info[SIGBUS] = h.bus_handler; + sig_info[SIGSEGV] = h.page_fault; + sig_info[SIGIO] = h.sigio_handler; + sig_info[SIGVTALRM] = h.timer_handler; + sig_info[SIGALRM] = h.timer_handler; + sig_info[SIGUSR2] = usr2_handler; +} void do_longjmp(void *b, int val) { -- cgit v1.2.2 From f8aaeacec159f2d9003872781fa4d49659e347fb Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Sun, 8 Jan 2006 01:01:32 -0800 Subject: [PATCH] consolidate asm/futex.h Most of the architectures have the same asm/futex.h. This consolidates them into asm-generic, with the arches including it from their own asm/futex.h. In the case of UML, this reverts the old broken futex.h and goes back to using the same one as almost everyone else. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-alpha/futex.h | 49 +-------------------------------------- include/asm-arm/futex.h | 49 +-------------------------------------- include/asm-arm26/futex.h | 49 +-------------------------------------- include/asm-cris/futex.h | 49 +-------------------------------------- include/asm-generic/futex.h | 53 +++++++++++++++++++++++++++++++++++++++++++ include/asm-h8300/futex.h | 49 +-------------------------------------- include/asm-ia64/futex.h | 49 +-------------------------------------- include/asm-m32r/futex.h | 49 +-------------------------------------- include/asm-m68k/futex.h | 49 +-------------------------------------- include/asm-m68knommu/futex.h | 49 +-------------------------------------- include/asm-parisc/futex.h | 49 +-------------------------------------- include/asm-s390/futex.h | 49 +-------------------------------------- include/asm-sh/futex.h | 49 +-------------------------------------- include/asm-sh64/futex.h | 49 +-------------------------------------- include/asm-sparc/futex.h | 49 +-------------------------------------- include/asm-sparc64/futex.h | 49 +-------------------------------------- include/asm-um/futex.h | 12 +++------- include/asm-v850/futex.h | 49 +-------------------------------------- 18 files changed, 72 insertions(+), 777 deletions(-) create mode 100644 include/asm-generic/futex.h diff --git a/include/asm-alpha/futex.h b/include/asm-alpha/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-alpha/futex.h +++ b/include/asm-alpha/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-arm/futex.h b/include/asm-arm/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-arm/futex.h +++ b/include/asm-arm/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-arm26/futex.h b/include/asm-arm26/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-arm26/futex.h +++ b/include/asm-arm26/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-cris/futex.h b/include/asm-cris/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-cris/futex.h +++ b/include/asm-cris/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h new file mode 100644 index 0000000000..3ae2c73475 --- /dev/null +++ b/include/asm-generic/futex.h @@ -0,0 +1,53 @@ +#ifndef _ASM_GENERIC_FUTEX_H +#define _ASM_GENERIC_FUTEX_H + +#ifdef __KERNEL__ + +#include +#include +#include + +static inline int +futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +{ + int op = (encoded_op >> 28) & 7; + int cmp = (encoded_op >> 24) & 15; + int oparg = (encoded_op << 8) >> 20; + int cmparg = (encoded_op << 20) >> 20; + int oldval = 0, ret; + if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) + oparg = 1 << oparg; + + if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + return -EFAULT; + + inc_preempt_count(); + + switch (op) { + case FUTEX_OP_SET: + case FUTEX_OP_ADD: + case FUTEX_OP_OR: + case FUTEX_OP_ANDN: + case FUTEX_OP_XOR: + default: + ret = -ENOSYS; + } + + dec_preempt_count(); + + if (!ret) { + switch (cmp) { + case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; + case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; + case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; + case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; + case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; + case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; + default: ret = -ENOSYS; + } + } + return ret; +} + +#endif +#endif diff --git a/include/asm-h8300/futex.h b/include/asm-h8300/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-h8300/futex.h +++ b/include/asm-h8300/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-ia64/futex.h b/include/asm-ia64/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-ia64/futex.h +++ b/include/asm-ia64/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-m32r/futex.h b/include/asm-m32r/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-m32r/futex.h +++ b/include/asm-m32r/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-m68k/futex.h b/include/asm-m68k/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-m68k/futex.h +++ b/include/asm-m68k/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-m68knommu/futex.h b/include/asm-m68knommu/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-m68knommu/futex.h +++ b/include/asm-m68knommu/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-parisc/futex.h b/include/asm-parisc/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-parisc/futex.h +++ b/include/asm-parisc/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-s390/futex.h b/include/asm-s390/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-s390/futex.h +++ b/include/asm-s390/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-sh/futex.h b/include/asm-sh/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-sh/futex.h +++ b/include/asm-sh/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-sh64/futex.h b/include/asm-sh64/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-sh64/futex.h +++ b/include/asm-sh64/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-sparc/futex.h b/include/asm-sparc/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-sparc/futex.h +++ b/include/asm-sparc/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-sparc64/futex.h b/include/asm-sparc64/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-sparc64/futex.h +++ b/include/asm-sparc64/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif diff --git a/include/asm-um/futex.h b/include/asm-um/futex.h index 142ee2d8e0..6a332a9f09 100644 --- a/include/asm-um/futex.h +++ b/include/asm-um/futex.h @@ -1,12 +1,6 @@ -#ifndef __UM_FUTEX_H -#define __UM_FUTEX_H +#ifndef _ASM_FUTEX_H +#define _ASM_FUTEX_H -#include -#include -#include -#include -#include - -#include "asm/arch/futex.h" +#include #endif diff --git a/include/asm-v850/futex.h b/include/asm-v850/futex.h index 9feff4ce14..6a332a9f09 100644 --- a/include/asm-v850/futex.h +++ b/include/asm-v850/futex.h @@ -1,53 +1,6 @@ #ifndef _ASM_FUTEX_H #define _ASM_FUTEX_H -#ifdef __KERNEL__ +#include -#include -#include -#include - -static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) -{ - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - - inc_preempt_count(); - - switch (op) { - case FUTEX_OP_SET: - case FUTEX_OP_ADD: - case FUTEX_OP_OR: - case FUTEX_OP_ANDN: - case FUTEX_OP_XOR: - default: - ret = -ENOSYS; - } - - dec_preempt_count(); - - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } - return ret; -} - -#endif #endif -- cgit v1.2.2 From c0a689d05d8dc1580aeb87b2959e3de8c57f5f77 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Sun, 8 Jan 2006 01:01:33 -0800 Subject: [PATCH] uml: whitespace cleanup This fixes some mangled whitespace added by the earlier trap_user.c patch. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/os-Linux/skas/trap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/um/os-Linux/skas/trap.c b/arch/um/os-Linux/skas/trap.c index 818f30e900..9ad5fbec45 100644 --- a/arch/um/os-Linux/skas/trap.c +++ b/arch/um/os-Linux/skas/trap.c @@ -34,11 +34,11 @@ void sig_handler_common_skas(int sig, void *sc_ptr) r = &TASK_REGS(get_current())->skas; save_user = r->is_user; r->is_user = 0; - if ( sig == SIGFPE || sig == SIGSEGV || - sig == SIGBUS || sig == SIGILL || - sig == SIGTRAP ) { - GET_FAULTINFO_FROM_SC(r->faultinfo, sc); - } + if ( sig == SIGFPE || sig == SIGSEGV || + sig == SIGBUS || sig == SIGILL || + sig == SIGTRAP ) { + GET_FAULTINFO_FROM_SC(r->faultinfo, sc); + } change_sig(SIGUSR1, 1); @@ -60,8 +60,8 @@ extern int ptrace_faultinfo; void user_signal(int sig, union uml_pt_regs *regs, int pid) { void (*handler)(int, union uml_pt_regs *); - int segv = ((sig == SIGFPE) || (sig == SIGSEGV) || (sig == SIGBUS) || - (sig == SIGILL) || (sig == SIGTRAP)); + int segv = ((sig == SIGFPE) || (sig == SIGSEGV) || (sig == SIGBUS) || + (sig == SIGILL) || (sig == SIGTRAP)); if (segv) get_skas_faultinfo(pid, ®s->skas.faultinfo); -- cgit v1.2.2 From eafbaa94691f6a1fa67c3b076caa3ce4e2920100 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 8 Jan 2006 01:01:34 -0800 Subject: [PATCH] uml: prevent MODE_SKAS=n and MODE_TT=n If MODE_TT=n, MODE_SKAS must be y. Signed-off-by: Adrian Bunk Acked-by: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 1eb21de9d1..cdaa2ab192 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -83,7 +83,7 @@ config KERNEL_HALF_GIGS of physical memory. config MODE_SKAS - bool "Separate Kernel Address Space support" + bool "Separate Kernel Address Space support" if MODE_TT default y help This option controls whether skas (separate kernel address space) -- cgit v1.2.2 From 4369ef3c3e9d3bd9b879580678778f558d481e90 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 8 Jan 2006 01:01:35 -0800 Subject: [PATCH] Make RCU task_struct safe for oprofile Applying RCU to the task structure broke oprofile, because free_task_notify() can now be called from softirq. This means that the task_mortuary lock must be acquired with irq disabled in order to avoid intermittent self-deadlock. 
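To make the self-deadlock concrete: once the notifier can run from softirq, a plain spin_lock() taken in process context can be interrupted on the same CPU by a softirq that tries to take the same lock, and that softirq then spins forever on a lock its own CPU already holds. A minimal sketch of the pattern and of the _irqsave form that closes the window (the function names below are placeholders, not oprofile code):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);

/*
 * Process context: without _irqsave, a softirq arriving here on the
 * same CPU and taking example_lock would deadlock against this path.
 */
static void process_context_path(void)
{
	unsigned long flags;

	spin_lock_irqsave(&example_lock, flags);
	/* ... walk or splice the shared list ... */
	spin_unlock_irqrestore(&example_lock, flags);
}

/* Softirq context, e.g. reached via an RCU callback. */
static void softirq_path(void)
{
	unsigned long flags;

	spin_lock_irqsave(&example_lock, flags);
	/* ... add an element to the shared list ... */
	spin_unlock_irqrestore(&example_lock, flags);
}
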
Since irq is now disabled, the critical section within process_task_mortuary() has been restructured to be O(1) in order to maximize scalability and minimize realtime latency degradation. Kudos to Wu Fengguang for finding this problem! CC: Wu Fengguang Cc: Philippe Elie Cc: John Levon Signed-off-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/oprofile/buffer_sync.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index 531b073131..b2e8e49c86 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -43,13 +43,16 @@ static void process_task_mortuary(void); * list for processing. Only after two full buffer syncs * does the task eventually get freed, because by then * we are sure we will not reference it again. + * Can be invoked from softirq via RCU callback due to + * call_rcu() of the task struct, hence the _irqsave. */ static int task_free_notify(struct notifier_block * self, unsigned long val, void * data) { + unsigned long flags; struct task_struct * task = data; - spin_lock(&task_mortuary); + spin_lock_irqsave(&task_mortuary, flags); list_add(&task->tasks, &dying_tasks); - spin_unlock(&task_mortuary); + spin_unlock_irqrestore(&task_mortuary, flags); return NOTIFY_OK; } @@ -431,25 +434,22 @@ static void increment_tail(struct oprofile_cpu_buffer * b) */ static void process_task_mortuary(void) { - struct list_head * pos; - struct list_head * pos2; + unsigned long flags; + LIST_HEAD(local_dead_tasks); struct task_struct * task; + struct task_struct * ttask; - spin_lock(&task_mortuary); + spin_lock_irqsave(&task_mortuary, flags); - list_for_each_safe(pos, pos2, &dead_tasks) { - task = list_entry(pos, struct task_struct, tasks); - list_del(&task->tasks); - free_task(task); - } + list_splice_init(&dead_tasks, &local_dead_tasks); + list_splice_init(&dying_tasks, &dead_tasks); - list_for_each_safe(pos, pos2, &dying_tasks) { - task = list_entry(pos, struct task_struct, tasks); + spin_unlock_irqrestore(&task_mortuary, flags); + + list_for_each_entry_safe(task, ttask, &local_dead_tasks, tasks) { list_del(&task->tasks); - list_add_tail(&task->tasks, &dead_tasks); + free_task(task); } - - spin_unlock(&task_mortuary); } -- cgit v1.2.2 From e56d090310d7625ecb43a1eeebd479f04affb48b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 8 Jan 2006 01:01:37 -0800 Subject: [PATCH] RCU signal handling RCU tasklist_lock and RCU signal handling: send signals RCU-read-locked instead of tasklist_lock read-locked. This is a scalability improvement on SMP and a preemption-latency improvement under PREEMPT_RCU. Signed-off-by: Paul E. 
McKenney Signed-off-by: Ingo Molnar Acked-by: William Irwin Cc: Roland McGrath Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 +-- include/linux/sched.h | 32 +++++++++++++++-- kernel/exit.c | 1 - kernel/fork.c | 10 +++++- kernel/pid.c | 22 ++++++------ kernel/rcupdate.c | 1 + kernel/sched.c | 7 ++++ kernel/signal.c | 97 +++++++++++++++++++++++++++++++++++++++++++-------- 8 files changed, 143 insertions(+), 31 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index e75a9548da..e9650cd22a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -760,7 +760,7 @@ no_thread_group: spin_lock(&oldsighand->siglock); spin_lock(&newsighand->siglock); - current->sighand = newsighand; + rcu_assign_pointer(current->sighand, newsighand); recalc_sigpending(); spin_unlock(&newsighand->siglock); @@ -768,7 +768,7 @@ no_thread_group: write_unlock_irq(&tasklist_lock); if (atomic_dec_and_test(&oldsighand->count)) - kmem_cache_free(sighand_cachep, oldsighand); + sighand_free(oldsighand); } BUG_ON(!thread_group_leader(current)); diff --git a/include/linux/sched.h b/include/linux/sched.h index a74662077d..a6af77e9b4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -34,6 +34,7 @@ #include #include #include +#include #include /* For AT_VECTOR_SIZE */ @@ -350,8 +351,16 @@ struct sighand_struct { atomic_t count; struct k_sigaction action[_NSIG]; spinlock_t siglock; + struct rcu_head rcu; }; +extern void sighand_free_cb(struct rcu_head *rhp); + +static inline void sighand_free(struct sighand_struct *sp) +{ + call_rcu(&sp->rcu, sighand_free_cb); +} + /* * NOTE! "signal_struct" does not have it's own * locking, because a shared signal_struct always @@ -844,6 +853,7 @@ struct task_struct { int cpuset_mems_generation; #endif atomic_t fs_excl; /* holding fs exclusive resources */ + struct rcu_head rcu; }; static inline pid_t process_group(struct task_struct *tsk) @@ -867,8 +877,26 @@ static inline int pid_alive(struct task_struct *p) extern void free_task(struct task_struct *tsk); extern void __put_task_struct(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) -#define put_task_struct(tsk) \ -do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) + +static inline int get_task_struct_rcu(struct task_struct *t) +{ + int oldusage; + + do { + oldusage = atomic_read(&t->usage); + if (oldusage == 0) + return 0; + } while (cmpxchg(&t->usage.counter, oldusage, oldusage+1) != oldusage); + return 1; +} + +extern void __put_task_struct_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + call_rcu(&t->rcu, __put_task_struct_cb); +} /* * Per process flags diff --git a/kernel/exit.c b/kernel/exit.c index ee515683b9..c73a7eb26d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,7 +72,6 @@ repeat: __ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); - __exit_sighand(p); /* * Note that the fastpath in sys_times depends on __exit_signal having * updated the counters before a task is removed from the tasklist of diff --git a/kernel/fork.c b/kernel/fork.c index fb8572a422..7fe3adfa65 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -743,6 +743,14 @@ int unshare_files(void) EXPORT_SYMBOL(unshare_files); +void sighand_free_cb(struct rcu_head *rhp) +{ + struct sighand_struct *sp; + + sp = container_of(rhp, struct sighand_struct, rcu); + kmem_cache_free(sighand_cachep, sp); +} + static inline 
int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) { struct sighand_struct *sig; @@ -752,7 +760,7 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); - tsk->sighand = sig; + rcu_assign_pointer(tsk->sighand, sig); if (!sig) return -ENOMEM; spin_lock_init(&sig->siglock); diff --git a/kernel/pid.c b/kernel/pid.c index edba31c681..1acc072469 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_type type, int nr) struct hlist_node *elem; struct pid *pid; - hlist_for_each_entry(pid, elem, + hlist_for_each_entry_rcu(pid, elem, &pid_hash[type][pid_hashfn(nr)], pid_chain) { if (pid->nr == nr) return pid; @@ -150,15 +150,15 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr) task_pid = &task->pids[type]; pid = find_pid(type, nr); + task_pid->nr = nr; if (pid == NULL) { - hlist_add_head(&task_pid->pid_chain, - &pid_hash[type][pid_hashfn(nr)]); INIT_LIST_HEAD(&task_pid->pid_list); + hlist_add_head_rcu(&task_pid->pid_chain, + &pid_hash[type][pid_hashfn(nr)]); } else { INIT_HLIST_NODE(&task_pid->pid_chain); - list_add_tail(&task_pid->pid_list, &pid->pid_list); + list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list); } - task_pid->nr = nr; return 0; } @@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t *task, enum pid_type type) pid = &task->pids[type]; if (!hlist_unhashed(&pid->pid_chain)) { - hlist_del(&pid->pid_chain); - if (list_empty(&pid->pid_list)) + if (list_empty(&pid->pid_list)) { nr = pid->nr; - else { + hlist_del_rcu(&pid->pid_chain); + } else { pid_next = list_entry(pid->pid_list.next, struct pid, pid_list); /* insert next pid from pid_list to hash */ - hlist_add_head(&pid_next->pid_chain, - &pid_hash[type][pid_hashfn(pid_next->nr)]); + hlist_replace_rcu(&pid->pid_chain, + &pid_next->pid_chain); } } - list_del(&pid->pid_list); + list_del_rcu(&pid->pid_list); pid->nr = 0; return nr; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c9afc61240..0a669bd2f6 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched.c b/kernel/sched.c index 6f46c94cc2..9273309115 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -176,6 +176,13 @@ static unsigned int task_timeslice(task_t *p) #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ < (long long) (sd)->cache_hot_time) +void __put_task_struct_cb(struct rcu_head *rhp) +{ + __put_task_struct(container_of(rhp, struct task_struct, rcu)); +} + +EXPORT_SYMBOL_GPL(__put_task_struct_cb); + /* * These are the runqueue data structures: */ diff --git a/kernel/signal.c b/kernel/signal.c index d7611f189e..64737c72da 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -329,13 +329,20 @@ void __exit_sighand(struct task_struct *tsk) /* Ok, we're done with the signal handlers */ tsk->sighand = NULL; if (atomic_dec_and_test(&sighand->count)) - kmem_cache_free(sighand_cachep, sighand); + sighand_free(sighand); } void exit_sighand(struct task_struct *tsk) { write_lock_irq(&tasklist_lock); - __exit_sighand(tsk); + rcu_read_lock(); + if (tsk->sighand != NULL) { + struct sighand_struct *sighand = rcu_dereference(tsk->sighand); + spin_lock(&sighand->siglock); + __exit_sighand(tsk); + spin_unlock(&sighand->siglock); + } + rcu_read_unlock(); write_unlock_irq(&tasklist_lock); } @@ -345,12 +352,14 @@ void exit_sighand(struct task_struct *tsk) void 
__exit_signal(struct task_struct *tsk) { struct signal_struct * sig = tsk->signal; - struct sighand_struct * sighand = tsk->sighand; + struct sighand_struct * sighand; if (!sig) BUG(); if (!atomic_read(&sig->count)) BUG(); + rcu_read_lock(); + sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); posix_cpu_timers_exit(tsk); if (atomic_dec_and_test(&sig->count)) { @@ -358,6 +367,7 @@ void __exit_signal(struct task_struct *tsk) if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); tsk->signal = NULL; + __exit_sighand(tsk); spin_unlock(&sighand->siglock); flush_sigqueue(&sig->shared_pending); } else { @@ -389,9 +399,11 @@ void __exit_signal(struct task_struct *tsk) sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->sched_time += tsk->sched_time; + __exit_sighand(tsk); spin_unlock(&sighand->siglock); sig = NULL; /* Marker for below. */ } + rcu_read_unlock(); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); flush_sigqueue(&tsk->pending); if (sig) { @@ -1080,18 +1092,28 @@ void zap_other_threads(struct task_struct *p) } /* - * Must be called with the tasklist_lock held for reading! + * Must be called under rcu_read_lock() or with tasklist_lock read-held. */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { unsigned long flags; + struct sighand_struct *sp; int ret; +retry: ret = check_kill_permission(sig, info, p); - if (!ret && sig && p->sighand) { - spin_lock_irqsave(&p->sighand->siglock, flags); + if (!ret && sig && (sp = p->sighand)) { + if (!get_task_struct_rcu(p)) + return -ESRCH; + spin_lock_irqsave(&sp->siglock, flags); + if (p->sighand != sp) { + spin_unlock_irqrestore(&sp->siglock, flags); + put_task_struct(p); + goto retry; + } ret = __group_send_sig_info(sig, info, p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); + spin_unlock_irqrestore(&sp->siglock, flags); + put_task_struct(p); } return ret; @@ -1136,14 +1158,21 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; + int acquired_tasklist_lock = 0; struct task_struct *p; - read_lock(&tasklist_lock); + rcu_read_lock(); + if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { + read_lock(&tasklist_lock); + acquired_tasklist_lock = 1; + } p = find_task_by_pid(pid); error = -ESRCH; if (p) error = group_send_sig_info(sig, info, p); - read_unlock(&tasklist_lock); + if (unlikely(acquired_tasklist_lock)) + read_unlock(&tasklist_lock); + rcu_read_unlock(); return error; } @@ -1355,16 +1384,54 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) { unsigned long flags; int ret = 0; + struct sighand_struct *sh; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); - read_lock(&tasklist_lock); + + /* + * The rcu based delayed sighand destroy makes it possible to + * run this without tasklist lock held. The task struct itself + * cannot go away as create_timer did get_task_struct(). + * + * We return -1, when the task is marked exiting, so + * posix_timer_event can redirect it to the group leader + */ + rcu_read_lock(); if (unlikely(p->flags & PF_EXITING)) { ret = -1; goto out_err; } - spin_lock_irqsave(&p->sighand->siglock, flags); +retry: + sh = rcu_dereference(p->sighand); + + spin_lock_irqsave(&sh->siglock, flags); + if (p->sighand != sh) { + /* We raced with exec() in a multithreaded process... 
*/ + spin_unlock_irqrestore(&sh->siglock, flags); + goto retry; + } + + /* + * We do the check here again to handle the following scenario: + * + * CPU 0 CPU 1 + * send_sigqueue + * check PF_EXITING + * interrupt exit code running + * __exit_signal + * lock sighand->siglock + * unlock sighand->siglock + * lock sh->siglock + * add(tsk->pending) flush_sigqueue(tsk->pending) + * + */ + + if (unlikely(p->flags & PF_EXITING)) { + ret = -1; + goto out; + } if (unlikely(!list_empty(&q->list))) { /* @@ -1388,9 +1455,9 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) signal_wake_up(p, sig == SIGKILL); out: - spin_unlock_irqrestore(&p->sighand->siglock, flags); + spin_unlock_irqrestore(&sh->siglock, flags); out_err: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } @@ -1402,7 +1469,9 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) int ret = 0; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); + read_lock(&tasklist_lock); + /* Since it_lock is held, p->sighand cannot be NULL. */ spin_lock_irqsave(&p->sighand->siglock, flags); handle_stop_signal(sig, p); @@ -1436,7 +1505,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) out: spin_unlock_irqrestore(&p->sighand->siglock, flags); read_unlock(&tasklist_lock); - return(ret); + return ret; } /* -- cgit v1.2.2 From 2d89c929078588aa9b9c674ef03ee9aa816b59b8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 8 Jan 2006 01:01:38 -0800 Subject: [PATCH] Simpler signal-exit concurrency handling Some simplification in checking signal delivery against concurrent exit. Instead of using get_task_struct_rcu(), which increments the task_struct reference count, check the reference count after acquiring sighand lock. Signed-off-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 64737c72da..9b6fda5e87 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1102,18 +1102,19 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) retry: ret = check_kill_permission(sig, info, p); - if (!ret && sig && (sp = p->sighand)) { - if (!get_task_struct_rcu(p)) - return -ESRCH; + if (!ret && sig && (sp = rcu_dereference(p->sighand))) { spin_lock_irqsave(&sp->siglock, flags); if (p->sighand != sp) { spin_unlock_irqrestore(&sp->siglock, flags); - put_task_struct(p); goto retry; } + if ((atomic_read(&sp->count) == 0) || + (atomic_read(&p->usage) == 0)) { + spin_unlock_irqrestore(&sp->siglock, flags); + return -ESRCH; + } ret = __group_send_sig_info(sig, info, p); spin_unlock_irqrestore(&sp->siglock, flags); - put_task_struct(p); } return ret; -- cgit v1.2.2 From d4829cd5b4bd1ea58ba1bebad44d562f4027c290 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 8 Jan 2006 01:01:39 -0800 Subject: [PATCH] remove get_task_struct_rcu() The latest set of signal-RCU patches does not use get_task_struct_rcu(). Attached is a patch that removes it. Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index a6af77e9b4..20bd707491 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -878,18 +878,6 @@ extern void free_task(struct task_struct *tsk); extern void __put_task_struct(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) -static inline int get_task_struct_rcu(struct task_struct *t) -{ - int oldusage; - - do { - oldusage = atomic_read(&t->usage); - if (oldusage == 0) - return 0; - } while (cmpxchg(&t->usage.counter, oldusage, oldusage+1) != oldusage); - return 1; -} - extern void __put_task_struct_cb(struct rcu_head *rhp); static inline void put_task_struct(struct task_struct *t) -- cgit v1.2.2 From 6e954b9e90c3a7157c0c1457dd3919e2a1345d23 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 8 Jan 2006 01:01:40 -0800 Subject: [PATCH] radix tree: code consolidation Introduce helper any_tag_set() rather than repeat the same code sequence 4 times. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 57 ++++++++++++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 88511c3805..1403e2c8bb 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -151,6 +151,20 @@ static inline int tag_get(struct radix_tree_node *node, int tag, int offset) return test_bit(offset, &node->tags[tag][0]); } +/* + * Returns 1 if any slot in the node has this tag set. + * Otherwise returns 0. + */ +static inline int any_tag_set(struct radix_tree_node *node, int tag) +{ + int idx; + for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { + if (node->tags[tag][idx]) + return 1; + } + return 0; +} + /* * Return the maximum key which can be store into a * radix tree with height HEIGHT. 
@@ -185,15 +199,9 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) * into the newly-pushed top-level node(s) */ for (tag = 0; tag < RADIX_TREE_TAGS; tag++) { - int idx; - tags[tag] = 0; - for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { - if (root->rnode->tags[tag][idx]) { - tags[tag] = 1; - break; - } - } + if (any_tag_set(root->rnode, tag)) + tags[tag] = 1; } do { @@ -427,13 +435,9 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, goto out; do { - int idx; - tag_clear(pathp->node, tag, pathp->offset); - for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { - if (pathp->node->tags[tag][idx]) - goto out; - } + if (any_tag_set(pathp->node, tag)) + goto out; pathp--; } while (pathp->node); out: @@ -729,19 +733,14 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) nr_cleared_tags = RADIX_TREE_TAGS; for (tag = 0; tag < RADIX_TREE_TAGS; tag++) { - int idx; - if (tags[tag]) continue; tag_clear(pathp->node, tag, pathp->offset); - for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { - if (pathp->node->tags[tag][idx]) { - tags[tag] = 1; - nr_cleared_tags--; - break; - } + if (any_tag_set(pathp->node, tag)) { + tags[tag] = 1; + nr_cleared_tags--; } } pathp--; @@ -770,15 +769,11 @@ EXPORT_SYMBOL(radix_tree_delete); */ int radix_tree_tagged(struct radix_tree_root *root, int tag) { - int idx; - - if (!root->rnode) - return 0; - for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { - if (root->rnode->tags[tag][idx]) - return 1; - } - return 0; + struct radix_tree_node *rnode; + rnode = root->rnode; + if (!rnode) + return 0; + return any_tag_set(rnode, tag); } EXPORT_SYMBOL(radix_tree_tagged); -- cgit v1.2.2 From d5274261ea46f0aae93820fe36628249120d2f75 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 8 Jan 2006 01:01:41 -0800 Subject: [PATCH] radix tree: early termination of tag clearing Correctly determine the tags to be cleared in radix_tree_delete() so we don't keep moving up the tree clearing tags that we don't need to. For example, if a tag is simply not set in the deleted item, nor anywhere up the tree, radix_tree_delete() would attempt to clear it up the entire height of the tree. Also, tag_set() was made conditional so as not to dirty too many cachelines high up in the radix tree. Instead, put this logic into radix_tree_tag_set(). 
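The early-termination walk this patch implements can be sketched in user space as follows; the parent pointer, the single tag bitmap per node and every name below are inventions for the sketch (the kernel records the path on the way down instead of keeping parent pointers):

#include <stdbool.h>
#include <stdio.h>

struct node {
	struct node *parent;
	int offset_in_parent;	/* which slot of the parent points here */
	unsigned long tag_bits;	/* one bit per child slot */
};

static bool any_tag_set(const struct node *n)
{
	return n->tag_bits != 0;
}

static void clear_tag_upwards(struct node *n, int offset)
{
	while (n) {
		/* tag was never set here: nothing above can need clearing */
		if (!(n->tag_bits & (1UL << offset)))
			return;
		n->tag_bits &= ~(1UL << offset);
		/* a sibling slot is still tagged: the summary bit above stays */
		if (any_tag_set(n))
			return;
		offset = n->offset_in_parent;
		n = n->parent;
	}
}

int main(void)
{
	struct node root  = { NULL, 0, 1UL << 3 };
	struct node child = { &root, 3, (1UL << 5) | (1UL << 7) };

	clear_tag_upwards(&child, 5);
	/* bit 7 is still set in the child, so the walk stops there */
	printf("child %#lx root %#lx\n", child.tag_bits, root.tag_bits);
	return 0;
}

The two early returns mirror the post-patch radix_tree_tag_clear(): stop if the tag was never set at this level, and stop as soon as another slot in the node still carries it, so an absent tag no longer costs a walk up the whole height of the tree.
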
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 1403e2c8bb..336852f235 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -137,18 +137,17 @@ out: static inline void tag_set(struct radix_tree_node *node, int tag, int offset) { - if (!test_bit(offset, &node->tags[tag][0])) - __set_bit(offset, &node->tags[tag][0]); + __set_bit(offset, node->tags[tag]); } static inline void tag_clear(struct radix_tree_node *node, int tag, int offset) { - __clear_bit(offset, &node->tags[tag][0]); + __clear_bit(offset, node->tags[tag]); } static inline int tag_get(struct radix_tree_node *node, int tag, int offset) { - return test_bit(offset, &node->tags[tag][0]); + return test_bit(offset, node->tags[tag]); } /* @@ -375,7 +374,8 @@ void *radix_tree_tag_set(struct radix_tree_root *root, int offset; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - tag_set(slot, tag, offset); + if (!tag_get(slot, tag, offset)) + tag_set(slot, tag, offset); slot = slot->slots[offset]; BUG_ON(slot == NULL); shift -= RADIX_TREE_MAP_SHIFT; @@ -435,6 +435,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, goto out; do { + if (!tag_get(pathp->node, tag, pathp->offset)) + goto out; tag_clear(pathp->node, tag, pathp->offset); if (any_tag_set(pathp->node, tag)) goto out; @@ -695,6 +697,8 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) void *ret = NULL; char tags[RADIX_TREE_TAGS]; int nr_cleared_tags; + int tag; + int offset; height = root->height; if (index > radix_tree_maxindex(height)) @@ -705,16 +709,14 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) slot = root->rnode; for ( ; height > 0; height--) { - int offset; - if (slot == NULL) goto out; + pathp++; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - pathp[1].offset = offset; - pathp[1].node = slot; + pathp->offset = offset; + pathp->node = slot; slot = slot->slots[offset]; - pathp++; shift -= RADIX_TREE_MAP_SHIFT; } @@ -727,24 +729,28 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) /* * Clear all tags associated with the just-deleted item */ - memset(tags, 0, sizeof(tags)); - do { - int tag; + nr_cleared_tags = 0; + for (tag = 0; tag < RADIX_TREE_TAGS; tag++) { + if (tag_get(pathp->node, tag, pathp->offset)) { + tag_clear(pathp->node, tag, pathp->offset); + tags[tag] = 0; + nr_cleared_tags++; + } else + tags[tag] = 1; + } - nr_cleared_tags = RADIX_TREE_TAGS; + for (pathp--; nr_cleared_tags && pathp->node; pathp--) { for (tag = 0; tag < RADIX_TREE_TAGS; tag++) { if (tags[tag]) continue; tag_clear(pathp->node, tag, pathp->offset); - if (any_tag_set(pathp->node, tag)) { tags[tag] = 1; nr_cleared_tags--; } } - pathp--; - } while (pathp->node && nr_cleared_tags); + } /* Now free the nodes we do not need anymore */ for (pathp = orig_pathp; pathp->node; pathp--) { -- cgit v1.2.2 From a5f51c966720fa519c6ce69b169107dbc5769cdf Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 8 Jan 2006 01:01:41 -0800 Subject: [PATCH] radix-tree: reduce tree height upon partial truncation Shrink the height of a radix tree when it is partially truncated - we only do shrinkage of full truncation at present. 
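For illustration, the shrink condition added here can be restated over a toy node type; the struct layout and all names are made up for this sketch, and freed nodes simply go back to malloc rather than to the radix-tree slab:

#include <stdio.h>
#include <stdlib.h>

#define MAP_SIZE 64

struct toy_node {
	int count;			/* populated slots */
	void *slots[MAP_SIZE];
};

struct toy_root {
	int height;
	struct toy_node *rnode;
};

/*
 * Collapse the root while only slot 0 is in use -- the same condition
 * radix_tree_shrink() tests above.
 */
static void toy_shrink(struct toy_root *root)
{
	while (root->height > 1 &&
	       root->rnode->count == 1 &&
	       root->rnode->slots[0]) {
		struct toy_node *to_free = root->rnode;

		root->rnode = to_free->slots[0];
		root->height--;
		free(to_free);
	}
}

int main(void)
{
	struct toy_node *leaf = calloc(1, sizeof(*leaf));
	struct toy_node *mid = calloc(1, sizeof(*mid));
	struct toy_node *top = calloc(1, sizeof(*top));
	struct toy_root r = { 3, top };

	mid->count = 1; mid->slots[0] = leaf;
	top->count = 1; top->slots[0] = mid;

	toy_shrink(&r);
	printf("height %d\n", r.height);	/* prints 1 */
	free(r.rnode);
	return 0;
}

The kernel version additionally clears the two tag bits of slot 0, NULLs the slot and resets the count before freeing, because only fully zeroed nodes may be returned to the slab.
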
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 46 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 336852f235..c0bd4a9148 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -253,7 +253,7 @@ int radix_tree_insert(struct radix_tree_root *root, shift = (height-1) * RADIX_TREE_MAP_SHIFT; offset = 0; /* uninitialised var warning */ - while (height > 0) { + do { if (slot == NULL) { /* Have to add a child node. */ if (!(slot = radix_tree_node_alloc(root))) @@ -271,18 +271,16 @@ int radix_tree_insert(struct radix_tree_root *root, slot = node->slots[offset]; shift -= RADIX_TREE_MAP_SHIFT; height--; - } + } while (height > 0); if (slot != NULL) return -EEXIST; - if (node) { - node->count++; - node->slots[offset] = item; - BUG_ON(tag_get(node, 0, offset)); - BUG_ON(tag_get(node, 1, offset)); - } else - root->rnode = item; + BUG_ON(!node); + node->count++; + node->slots[offset] = item; + BUG_ON(tag_get(node, 0, offset)); + BUG_ON(tag_get(node, 1, offset)); return 0; } @@ -679,6 +677,29 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, } EXPORT_SYMBOL(radix_tree_gang_lookup_tag); +/** + * radix_tree_shrink - shrink height of a radix tree to minimal + * @root radix tree root + */ +static inline void radix_tree_shrink(struct radix_tree_root *root) +{ + /* try to shrink tree height */ + while (root->height > 1 && + root->rnode->count == 1 && + root->rnode->slots[0]) { + struct radix_tree_node *to_free = root->rnode; + + root->rnode = to_free->slots[0]; + root->height--; + /* must only free zeroed nodes into the slab */ + tag_clear(to_free, 0, 0); + tag_clear(to_free, 1, 0); + to_free->slots[0] = NULL; + to_free->count = 0; + radix_tree_node_free(to_free); + } +} + /** * radix_tree_delete - delete an item from a radix tree * @root: radix tree root @@ -755,8 +776,13 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) /* Now free the nodes we do not need anymore */ for (pathp = orig_pathp; pathp->node; pathp--) { pathp->node->slots[pathp->offset] = NULL; - if (--pathp->node->count) + pathp->node->count--; + + if (pathp->node->count) { + if (pathp->node == root->rnode) + radix_tree_shrink(root); goto out; + } /* Node with zero slots in use so free it */ radix_tree_node_free(pathp->node); -- cgit v1.2.2 From 50dd26ba0947aa653f0e42897aad7a4adce4e620 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 8 Jan 2006 01:01:42 -0800 Subject: [PATCH] DEBUG_SLAB depends on SLAB Make DEBUG_SLAB depend on SLAB. Signed-off-by: Ingo Molnar Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 80598cfd72..c48260fb8f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -79,7 +79,7 @@ config SCHEDSTATS config DEBUG_SLAB bool "Debug memory allocations" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && SLAB help Say Y here to have the kernel do limited verification on memory allocation as well as poisoning memory on free to catch use of freed -- cgit v1.2.2 From 30992c97ae9d01b17374fbfab76a869fb4bba500 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:01:43 -0800 Subject: [PATCH] slob: introduce mm/util.c for shared functions Add mm/util.c for functions common between SLAB and SLOB. 
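The two helpers being moved are small enough to restate in user space; this sketch swaps kmalloc() for malloc() and drops the gfp flags, which is the only liberty taken:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void *zalloc(size_t size)
{
	void *ret = malloc(size);

	if (ret)
		memset(ret, 0, size);	/* zeroed, like kzalloc() */
	return ret;
}

static char *strdup_copy(const char *s)
{
	size_t len;
	char *buf;

	if (!s)
		return NULL;

	len = strlen(s) + 1;
	buf = malloc(len);
	if (buf)
		memcpy(buf, s, len);	/* duplicate, like kstrdup() */
	return buf;
}

int main(void)
{
	int *v = zalloc(4 * sizeof(*v));
	char *name = strdup_copy("slob");

	printf("%d %s\n", v ? v[0] : -1, name ? name : "(null)");
	free(v);
	free(name);
	return 0;
}

Either allocator can then provide kmalloc() underneath while sharing these wrappers from mm/util.c unchanged.
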
Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 2 +- mm/slab.c | 37 ------------------------------------- mm/util.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 38 deletions(-) create mode 100644 mm/util.c diff --git a/mm/Makefile b/mm/Makefile index 2fa6d2ca9f..74c85ddc91 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ readahead.o slab.o swap.o truncate.o vmscan.o \ - prio_tree.o $(mmu-y) + prio_tree.o util.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o diff --git a/mm/slab.c b/mm/slab.c index 76b092bd0b..1c46c63835 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3052,20 +3052,6 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp) } EXPORT_SYMBOL(kmem_cache_free); -/** - * kzalloc - allocate memory. The memory is set to zero. - * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - */ -void *kzalloc(size_t size, gfp_t flags) -{ - void *ret = kmalloc(size, flags); - if (ret) - memset(ret, 0, size); - return ret; -} -EXPORT_SYMBOL(kzalloc); - /** * kfree - free previously allocated memory * @objp: pointer returned by kmalloc. @@ -3659,26 +3645,3 @@ unsigned int ksize(const void *objp) return obj_reallen(page_get_cache(virt_to_page(objp))); } - - -/* - * kstrdup - allocate space for and copy an existing string - * - * @s: the string to duplicate - * @gfp: the GFP mask used in the kmalloc() call when allocating memory - */ -char *kstrdup(const char *s, gfp_t gfp) -{ - size_t len; - char *buf; - - if (!s) - return NULL; - - len = strlen(s) + 1; - buf = kmalloc(len, gfp); - if (buf) - memcpy(buf, s, len); - return buf; -} -EXPORT_SYMBOL(kstrdup); diff --git a/mm/util.c b/mm/util.c new file mode 100644 index 0000000000..5f4bb59da6 --- /dev/null +++ b/mm/util.c @@ -0,0 +1,39 @@ +#include +#include +#include + +/** + * kzalloc - allocate memory. The memory is set to zero. + * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + */ +void *kzalloc(size_t size, gfp_t flags) +{ + void *ret = kmalloc(size, flags); + if (ret) + memset(ret, 0, size); + return ret; +} +EXPORT_SYMBOL(kzalloc); + +/* + * kstrdup - allocate space for and copy an existing string + * + * @s: the string to duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + */ +char *kstrdup(const char *s, gfp_t gfp) +{ + size_t len; + char *buf; + + if (!s) + return NULL; + + len = strlen(s) + 1; + buf = kmalloc(len, gfp); + if (buf) + memcpy(buf, s, len); + return buf; +} +EXPORT_SYMBOL(kstrdup); -- cgit v1.2.2 From 10cef6029502915bdb3cf0821d425cf9dc30c817 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:01:45 -0800 Subject: [PATCH] slob: introduce the SLOB allocator configurable replacement for slab allocator This adds a CONFIG_SLAB option under CONFIG_EMBEDDED. When CONFIG_SLAB is disabled, the kernel falls back to using the 'SLOB' allocator. SLOB is a traditional K&R/UNIX allocator with a SLAB emulation layer, similar to the original Linux kmalloc allocator that SLAB replaced. It's signicantly smaller code and is more memory efficient. 
But like all similar allocators, it scales poorly and suffers from fragmentation more than SLAB, so it's only appropriate for small systems. It's been tested extensively in the Linux-tiny tree. I've also stress-tested it with make -j 8 compiles on a 3G SMP+PREEMPT box (not recommended). Here's a comparison for otherwise identical builds, showing SLOB saving nearly half a megabyte of RAM: $ size vmlinux* text data bss dec hex filename 3336372 529360 190812 4056544 3de5e0 vmlinux-slab 3323208 527948 190684 4041840 3dac70 vmlinux-slob $ size mm/{slab,slob}.o text data bss dec hex filename 13221 752 48 14021 36c5 mm/slab.o 1896 52 8 1956 7a4 mm/slob.o /proc/meminfo: SLAB SLOB delta MemTotal: 27964 kB 27980 kB +16 kB MemFree: 24596 kB 25092 kB +496 kB Buffers: 36 kB 36 kB 0 kB Cached: 1188 kB 1188 kB 0 kB SwapCached: 0 kB 0 kB 0 kB Active: 608 kB 600 kB -8 kB Inactive: 808 kB 812 kB +4 kB HighTotal: 0 kB 0 kB 0 kB HighFree: 0 kB 0 kB 0 kB LowTotal: 27964 kB 27980 kB +16 kB LowFree: 24596 kB 25092 kB +496 kB SwapTotal: 0 kB 0 kB 0 kB SwapFree: 0 kB 0 kB 0 kB Dirty: 4 kB 12 kB +8 kB Writeback: 0 kB 0 kB 0 kB Mapped: 560 kB 556 kB -4 kB Slab: 1756 kB 0 kB -1756 kB CommitLimit: 13980 kB 13988 kB +8 kB Committed_AS: 4208 kB 4208 kB 0 kB PageTables: 28 kB 28 kB 0 kB VmallocTotal: 1007312 kB 1007312 kB 0 kB VmallocUsed: 48 kB 48 kB 0 kB VmallocChunk: 1007264 kB 1007264 kB 0 kB (this work has been sponsored in part by CELF) From: Ingo Molnar Fix 32-bitness bugs in mm/slob.c. Signed-off-by: Matt Mackall Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 4 + include/linux/slab.h | 35 +++++ init/Kconfig | 13 ++ mm/Makefile | 4 +- mm/slob.c | 385 +++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 440 insertions(+), 1 deletion(-) create mode 100644 mm/slob.c diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 5b6b0b6038..63bf6c00fa 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -323,6 +323,7 @@ static struct file_operations proc_modules_operations = { }; #endif +#ifdef CONFIG_SLAB extern struct seq_operations slabinfo_op; extern ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *); static int slabinfo_open(struct inode *inode, struct file *file) @@ -336,6 +337,7 @@ static struct file_operations proc_slabinfo_operations = { .llseek = seq_lseek, .release = seq_release, }; +#endif static int show_stat(struct seq_file *p, void *v) { @@ -600,7 +602,9 @@ void __init proc_misc_init(void) create_seq_entry("partitions", 0, &proc_partitions_operations); create_seq_entry("stat", 0, &proc_stat_operations); create_seq_entry("interrupts", 0, &proc_interrupts_operations); +#ifdef CONFIG_SLAB create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations); +#endif create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations); create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations); create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations); diff --git a/include/linux/slab.h b/include/linux/slab.h index d1ea4051b9..1fb77a9cc1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -53,6 +53,8 @@ typedef struct kmem_cache kmem_cache_t; #define SLAB_CTOR_ATOMIC 0x002UL /* tell constructor it can't sleep */ #define SLAB_CTOR_VERIFY 0x004UL /* tell constructor it's a verify call */ +#ifndef CONFIG_SLOB + /* prototypes */ extern void __init kmem_cache_init(void); @@ -134,6 +136,39 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node) extern int 
FASTCALL(kmem_cache_reap(int)); extern int FASTCALL(kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)); +#else /* CONFIG_SLOB */ + +/* SLOB allocator routines */ + +void kmem_cache_init(void); +struct kmem_cache *kmem_find_general_cachep(size_t, gfp_t gfpflags); +struct kmem_cache *kmem_cache_create(const char *c, size_t, size_t, + unsigned long, + void (*)(void *, struct kmem_cache *, unsigned long), + void (*)(void *, struct kmem_cache *, unsigned long)); +int kmem_cache_destroy(struct kmem_cache *c); +void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags); +void kmem_cache_free(struct kmem_cache *c, void *b); +const char *kmem_cache_name(struct kmem_cache *); +void *kmalloc(size_t size, gfp_t flags); +void *kzalloc(size_t size, gfp_t flags); +void kfree(const void *m); +unsigned int ksize(const void *m); +unsigned int kmem_cache_size(struct kmem_cache *c); + +static inline void *kcalloc(size_t n, size_t size, gfp_t flags) +{ + return kzalloc(n * size, flags); +} + +#define kmem_cache_shrink(d) (0) +#define kmem_cache_reap(a) +#define kmem_ptr_validate(a, b) (0) +#define kmem_cache_alloc_node(c, f, n) kmem_cache_alloc(c, f) +#define kmalloc_node(s, f, n) kmalloc(s, f) + +#endif /* CONFIG_SLOB */ + /* System wide caches */ extern kmem_cache_t *vm_area_cachep; extern kmem_cache_t *names_cachep; diff --git a/init/Kconfig b/init/Kconfig index ba42f3793a..0c9932f9f0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -380,6 +380,15 @@ config CC_ALIGN_JUMPS no dummy operations need be executed. Zero means use compiler's default. +config SLAB + default y + bool "Use full SLAB allocator" if EMBEDDED + help + Disabling this replaces the advanced SLAB allocator and + kmalloc support with the drastically simpler SLOB allocator. + SLOB is more space efficient but does not scale well and is + more susceptible to fragmentation. + endmenu # General setup config TINY_SHMEM @@ -391,6 +400,10 @@ config BASE_SMALL default 0 if BASE_FULL default 1 if !BASE_FULL +config SLOB + default !SLAB + bool + menu "Loadable module support" config MODULES diff --git a/mm/Makefile b/mm/Makefile index 74c85ddc91..9aa03fa1dc 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ - readahead.o slab.o swap.o truncate.o vmscan.o \ + readahead.o swap.o truncate.o vmscan.o \ prio_tree.o util.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o @@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o +obj-$(CONFIG_SLOB) += slob.o +obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o diff --git a/mm/slob.c b/mm/slob.c new file mode 100644 index 0000000000..1c240c4b71 --- /dev/null +++ b/mm/slob.c @@ -0,0 +1,385 @@ +/* + * SLOB Allocator: Simple List Of Blocks + * + * Matt Mackall 12/30/03 + * + * How SLOB works: + * + * The core of SLOB is a traditional K&R style heap allocator, with + * support for returning aligned objects. The granularity of this + * allocator is 8 bytes on x86, though it's perhaps possible to reduce + * this to 4 if it's deemed worth the effort. The slob heap is a + * singly-linked list of pages from __get_free_page, grown on demand + * and allocation from the heap is currently first-fit. + * + * Above this is an implementation of kmalloc/kfree. 
Blocks returned + * from kmalloc are 8-byte aligned and prepended with a 8-byte header. + * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls + * __get_free_pages directly so that it can return page-aligned blocks + * and keeps a linked list of such pages and their orders. These + * objects are detected in kfree() by their page alignment. + * + * SLAB is emulated on top of SLOB by simply calling constructors and + * destructors for every SLAB allocation. Objects are returned with + * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is + * set, in which case the low-level allocator will fragment blocks to + * create the proper alignment. Again, objects of page-size or greater + * are allocated by calling __get_free_pages. As SLAB objects know + * their size, no separate size bookkeeping is necessary and there is + * essentially no allocation space overhead. + */ + +#include +#include +#include +#include +#include +#include +#include + +struct slob_block { + int units; + struct slob_block *next; +}; +typedef struct slob_block slob_t; + +#define SLOB_UNIT sizeof(slob_t) +#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) +#define SLOB_ALIGN L1_CACHE_BYTES + +struct bigblock { + int order; + void *pages; + struct bigblock *next; +}; +typedef struct bigblock bigblock_t; + +static slob_t arena = { .next = &arena, .units = 1 }; +static slob_t *slobfree = &arena; +static bigblock_t *bigblocks; +static DEFINE_SPINLOCK(slob_lock); +static DEFINE_SPINLOCK(block_lock); + +static void slob_free(void *b, int size); + +static void *slob_alloc(size_t size, gfp_t gfp, int align) +{ + slob_t *prev, *cur, *aligned = 0; + int delta = 0, units = SLOB_UNITS(size); + unsigned long flags; + + spin_lock_irqsave(&slob_lock, flags); + prev = slobfree; + for (cur = prev->next; ; prev = cur, cur = cur->next) { + if (align) { + aligned = (slob_t *)ALIGN((unsigned long)cur, align); + delta = aligned - cur; + } + if (cur->units >= units + delta) { /* room enough? */ + if (delta) { /* need to fragment head to align? */ + aligned->units = cur->units - delta; + aligned->next = cur->next; + cur->next = aligned; + cur->units = delta; + prev = cur; + cur = aligned; + } + + if (cur->units == units) /* exact fit? */ + prev->next = cur->next; /* unlink */ + else { /* fragment */ + prev->next = cur + units; + prev->next->units = cur->units - units; + prev->next->next = cur->next; + cur->units = units; + } + + slobfree = prev; + spin_unlock_irqrestore(&slob_lock, flags); + return cur; + } + if (cur == slobfree) { + spin_unlock_irqrestore(&slob_lock, flags); + + if (size == PAGE_SIZE) /* trying to shrink arena? 
*/ + return 0; + + cur = (slob_t *)__get_free_page(gfp); + if (!cur) + return 0; + + slob_free(cur, PAGE_SIZE); + spin_lock_irqsave(&slob_lock, flags); + cur = slobfree; + } + } +} + +static void slob_free(void *block, int size) +{ + slob_t *cur, *b = (slob_t *)block; + unsigned long flags; + + if (!block) + return; + + if (size) + b->units = SLOB_UNITS(size); + + /* Find reinsertion point */ + spin_lock_irqsave(&slob_lock, flags); + for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next) + if (cur >= cur->next && (b > cur || b < cur->next)) + break; + + if (b + b->units == cur->next) { + b->units += cur->next->units; + b->next = cur->next->next; + } else + b->next = cur->next; + + if (cur + cur->units == b) { + cur->units += b->units; + cur->next = b->next; + } else + cur->next = b; + + slobfree = cur; + + spin_unlock_irqrestore(&slob_lock, flags); +} + +static int FASTCALL(find_order(int size)); +static int fastcall find_order(int size) +{ + int order = 0; + for ( ; size > 4096 ; size >>=1) + order++; + return order; +} + +void *kmalloc(size_t size, gfp_t gfp) +{ + slob_t *m; + bigblock_t *bb; + unsigned long flags; + + if (size < PAGE_SIZE - SLOB_UNIT) { + m = slob_alloc(size + SLOB_UNIT, gfp, 0); + return m ? (void *)(m + 1) : 0; + } + + bb = slob_alloc(sizeof(bigblock_t), gfp, 0); + if (!bb) + return 0; + + bb->order = find_order(size); + bb->pages = (void *)__get_free_pages(gfp, bb->order); + + if (bb->pages) { + spin_lock_irqsave(&block_lock, flags); + bb->next = bigblocks; + bigblocks = bb; + spin_unlock_irqrestore(&block_lock, flags); + return bb->pages; + } + + slob_free(bb, sizeof(bigblock_t)); + return 0; +} + +EXPORT_SYMBOL(kmalloc); + +void kfree(const void *block) +{ + bigblock_t *bb, **last = &bigblocks; + unsigned long flags; + + if (!block) + return; + + if (!((unsigned long)block & (PAGE_SIZE-1))) { + /* might be on the big block list */ + spin_lock_irqsave(&block_lock, flags); + for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) { + if (bb->pages == block) { + *last = bb->next; + spin_unlock_irqrestore(&block_lock, flags); + free_pages((unsigned long)block, bb->order); + slob_free(bb, sizeof(bigblock_t)); + return; + } + } + spin_unlock_irqrestore(&block_lock, flags); + } + + slob_free((slob_t *)block - 1, 0); + return; +} + +EXPORT_SYMBOL(kfree); + +unsigned int ksize(const void *block) +{ + bigblock_t *bb; + unsigned long flags; + + if (!block) + return 0; + + if (!((unsigned long)block & (PAGE_SIZE-1))) { + spin_lock_irqsave(&block_lock, flags); + for (bb = bigblocks; bb; bb = bb->next) + if (bb->pages == block) { + spin_unlock_irqrestore(&slob_lock, flags); + return PAGE_SIZE << bb->order; + } + spin_unlock_irqrestore(&block_lock, flags); + } + + return ((slob_t *)block - 1)->units * SLOB_UNIT; +} + +struct kmem_cache { + unsigned int size, align; + const char *name; + void (*ctor)(void *, struct kmem_cache *, unsigned long); + void (*dtor)(void *, struct kmem_cache *, unsigned long); +}; + +struct kmem_cache *kmem_cache_create(const char *name, size_t size, + size_t align, unsigned long flags, + void (*ctor)(void*, struct kmem_cache *, unsigned long), + void (*dtor)(void*, struct kmem_cache *, unsigned long)) +{ + struct kmem_cache *c; + + c = slob_alloc(sizeof(struct kmem_cache), flags, 0); + + if (c) { + c->name = name; + c->size = size; + c->ctor = ctor; + c->dtor = dtor; + /* ignore alignment unless it's forced */ + c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? 
SLOB_ALIGN : 0; + if (c->align < align) + c->align = align; + } + + return c; +} +EXPORT_SYMBOL(kmem_cache_create); + +int kmem_cache_destroy(struct kmem_cache *c) +{ + slob_free(c, sizeof(struct kmem_cache)); + return 0; +} +EXPORT_SYMBOL(kmem_cache_destroy); + +void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) +{ + void *b; + + if (c->size < PAGE_SIZE) + b = slob_alloc(c->size, flags, c->align); + else + b = (void *)__get_free_pages(flags, find_order(c->size)); + + if (c->ctor) + c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR); + + return b; +} +EXPORT_SYMBOL(kmem_cache_alloc); + +void kmem_cache_free(struct kmem_cache *c, void *b) +{ + if (c->dtor) + c->dtor(b, c, 0); + + if (c->size < PAGE_SIZE) + slob_free(b, c->size); + else + free_pages((unsigned long)b, find_order(c->size)); +} +EXPORT_SYMBOL(kmem_cache_free); + +unsigned int kmem_cache_size(struct kmem_cache *c) +{ + return c->size; +} +EXPORT_SYMBOL(kmem_cache_size); + +const char *kmem_cache_name(struct kmem_cache *c) +{ + return c->name; +} +EXPORT_SYMBOL(kmem_cache_name); + +static struct timer_list slob_timer = TIMER_INITIALIZER( + (void (*)(unsigned long))kmem_cache_init, 0, 0); + +void kmem_cache_init(void) +{ + void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); + + if (p) + free_page((unsigned long)p); + + mod_timer(&slob_timer, jiffies + HZ); +} + +atomic_t slab_reclaim_pages = ATOMIC_INIT(0); +EXPORT_SYMBOL(slab_reclaim_pages); + +#ifdef CONFIG_SMP + +void *__alloc_percpu(size_t size, size_t align) +{ + int i; + struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); + + if (!pdata) + return NULL; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); + if (!pdata->ptrs[i]) + goto unwind_oom; + memset(pdata->ptrs[i], 0, size); + } + + /* Catch derefs w/o wrappers */ + return (void *) (~(unsigned long) pdata); + +unwind_oom: + while (--i >= 0) { + if (!cpu_possible(i)) + continue; + kfree(pdata->ptrs[i]); + } + kfree(pdata); + return NULL; +} +EXPORT_SYMBOL(__alloc_percpu); + +void +free_percpu(const void *objp) +{ + int i; + struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + kfree(p->ptrs[i]); + } + kfree(p); +} +EXPORT_SYMBOL(free_percpu); + +#endif -- cgit v1.2.2 From 96b7f34143c2c823a6a750fcb758fc66c44945d2 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:46 -0800 Subject: [PATCH] cpuset: better bitmap remap defaults Fix the default behaviour for the remap operators in bitmap, cpumask and nodemask. As previously submitted, the pair of masks defined a map of the positions of the set bits in A to the corresponding bits in B. This is still true. The issue is how to map the other positions, corresponding to the unset (0) bits in A. As previously submitted, they were all mapped to the first set bit position in B, a constant map. When I tried to code per-vma mempolicy rebinding using these remap operators, I realized this was wrong. This patch changes the default to map all the unset bit positions in A to the same positions in B, the identity map. For example, if A has bits 4-7 set, and B has bits 9-12 set, then the map defined by the pair maps each bit position in the first 32 bits as follows: 0 ==> 0 ... 3 ==> 3 4 ==> 9 ... 7 ==> 12 8 ==> 8 9 ==> 9 ... 31 ==> 31 This now corresponds to the typical behaviour desired when migrating pages and policies from one cpuset to another. 
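(Illustration, not part of the patch: the mapping rule above can be reproduced in a few lines of userspace C. The sketch below mirrors the new bitmap_bitremap() behaviour on plain 32-bit masks, so the helper names and the fixed 32-bit width are simplifications relative to the kernel's unsigned-long-array bitmap API shown in the diff further down.)

/*
 * Userspace sketch of the remap rule described above.  Positions set
 * in 'old' map to the (ordinal mod weight)-th set bit of 'new'; all
 * other positions map to themselves (the identity map).
 */
#include <stdio.h>

static int ord_of(unsigned m, int pos)          /* which set bit is 'pos'? */
{
        int i, ord = 0;

        if (!(m & (1u << pos)))
                return -1;
        for (i = 0; i < pos; i++)
                if (m & (1u << i))
                        ord++;
        return ord;
}

static int pos_of(unsigned m, int ord)          /* position of the ord-th set bit */
{
        int i;

        for (i = 0; i < 32; i++)
                if ((m & (1u << i)) && ord-- == 0)
                        return i;
        return -1;
}

static int bitremap(int pos, unsigned old, unsigned new)
{
        int w = __builtin_popcount(new);
        int n = ord_of(old, pos);

        return (n < 0 || w == 0) ? pos : pos_of(new, n % w);
}

int main(void)
{
        unsigned old = 0x000000f0;              /* bits 4-7  */
        unsigned new = 0x00001e00;              /* bits 9-12 */
        int p;

        for (p = 0; p < 16; p++)
                printf("%2d ==> %2d\n", p, bitremap(p, old, new));
        return 0;
}

Running it with old = bits 4-7 and new = bits 9-12 prints the same table as the example above: 0-3 and 8-15 map to themselves, 4-7 map to 9-12.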
The pages on nodes within the original cpuset, and the references in memory policies to nodes within the original cpuset, are migrated to the corresponding cpuset-relative nodes in the destination cpuset. Other pages and node references are left untouched. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/bitmap.c | 89 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/lib/bitmap.c b/lib/bitmap.c index 23d3b1147f..48e708381d 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -519,7 +519,7 @@ EXPORT_SYMBOL(bitmap_parselist); * * Map the bit at position @pos in @buf (of length @bits) to the * ordinal of which set bit it is. If it is not set or if @pos - * is not a valid bit position, map to zero (0). + * is not a valid bit position, map to -1. * * If for example, just bits 4 through 7 are set in @buf, then @pos * values 4 through 7 will get mapped to 0 through 3, respectively, @@ -531,18 +531,19 @@ EXPORT_SYMBOL(bitmap_parselist); */ static int bitmap_pos_to_ord(const unsigned long *buf, int pos, int bits) { - int ord = 0; + int i, ord; - if (pos >= 0 && pos < bits) { - int i; + if (pos < 0 || pos >= bits || !test_bit(pos, buf)) + return -1; - for (i = find_first_bit(buf, bits); - i < pos; - i = find_next_bit(buf, bits, i + 1)) - ord++; - if (i > pos) - ord = 0; + i = find_first_bit(buf, bits); + ord = 0; + while (i < pos) { + i = find_next_bit(buf, bits, i + 1); + ord++; } + BUG_ON(i != pos); + return ord; } @@ -553,11 +554,12 @@ static int bitmap_pos_to_ord(const unsigned long *buf, int pos, int bits) * @bits: number of valid bit positions in @buf * * Map the ordinal offset of bit @ord in @buf to its position in @buf. - * If @ord is not the ordinal offset of a set bit in @buf, map to zero (0). + * Value of @ord should be in range 0 <= @ord < weight(buf), else + * results are undefined. * * If for example, just bits 4 through 7 are set in @buf, then @ord * values 0 through 3 will get mapped to 4 through 7, respectively, - * and all other @ord valuds will get mapped to 0. When @ord value 3 + * and all other @ord values return undefined values. When @ord value 3 * gets mapped to (returns) @pos value 7 in this example, that means * that the 3rd set bit (starting with 0th) is at position 7 in @buf. * @@ -583,8 +585,8 @@ static int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits) /** * bitmap_remap - Apply map defined by a pair of bitmaps to another bitmap - * @src: subset to be remapped * @dst: remapped result + * @src: subset to be remapped * @old: defines domain of map * @new: defines range of map * @bits: number of bits in each of these bitmaps @@ -596,49 +598,42 @@ static int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits) * weight of @old, map the position of the n-th set bit in @old to * the position of the m-th set bit in @new, where m == n % w. * - * If either of the @old and @new bitmaps are empty, or if@src and @dst - * point to the same location, then this routine does nothing. + * If either of the @old and @new bitmaps are empty, or if @src and + * @dst point to the same location, then this routine copies @src + * to @dst. * - * The positions of unset bits in @old are mapped to the position of - * the first set bit in @new. + * The positions of unset bits in @old are mapped to themselves + * (the identify map). * * Apply the above specified mapping to @src, placing the result in * @dst, clearing any bits previously set in @dst. 
* - * The resulting value of @dst will have either the same weight as - * @src, or less weight in the general case that the mapping wasn't - * injective due to the weight of @new being less than that of @old. - * The resulting value of @dst will never have greater weight than - * that of @src, except perhaps in the case that one of the above - * conditions was not met and this routine just returned. - * * For example, lets say that @old has bits 4 through 7 set, and * @new has bits 12 through 15 set. This defines the mapping of bit * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other - * bit positions to 12 (the first set bit in @new. So if say @src - * comes into this routine with bits 1, 5 and 7 set, then @dst should - * leave with bits 12, 13 and 15 set. + * bit positions unchanged. So if say @src comes into this routine + * with bits 1, 5 and 7 set, then @dst should leave with bits 1, + * 13 and 15 set. */ void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, int bits) { - int s; + int oldbit, w; - if (bitmap_weight(old, bits) == 0) - return; - if (bitmap_weight(new, bits) == 0) - return; if (dst == src) /* following doesn't handle inplace remaps */ return; - bitmap_zero(dst, bits); - for (s = find_first_bit(src, bits); - s < bits; - s = find_next_bit(src, bits, s + 1)) { - int x = bitmap_pos_to_ord(old, s, bits); - int y = bitmap_ord_to_pos(new, x, bits); - set_bit(y, dst); + + w = bitmap_weight(new, bits); + for (oldbit = find_first_bit(src, bits); + oldbit < bits; + oldbit = find_next_bit(src, bits, oldbit + 1)) { + int n = bitmap_pos_to_ord(old, oldbit, bits); + if (n < 0 || w == 0) + set_bit(oldbit, dst); /* identity map */ + else + set_bit(bitmap_ord_to_pos(new, n % w, bits), dst); } } EXPORT_SYMBOL(bitmap_remap); @@ -657,8 +652,8 @@ EXPORT_SYMBOL(bitmap_remap); * weight of @old, map the position of the n-th set bit in @old to * the position of the m-th set bit in @new, where m == n % w. * - * The positions of unset bits in @old are mapped to the position of - * the first set bit in @new. + * The positions of unset bits in @old are mapped to themselves + * (the identify map). * * Apply the above specified mapping to bit position @oldbit, returning * the new bit position. @@ -666,14 +661,18 @@ EXPORT_SYMBOL(bitmap_remap); * For example, lets say that @old has bits 4 through 7 set, and * @new has bits 12 through 15 set. This defines the mapping of bit * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other - * bit positions to 12 (the first set bit in @new. So if say @oldbit - * is 5, then this routine returns 13. + * bit positions unchanged. So if say @oldbit is 5, then this routine + * returns 13. */ int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits) { - int x = bitmap_pos_to_ord(old, oldbit, bits); - return bitmap_ord_to_pos(new, x, bits); + int w = bitmap_weight(new, bits); + int n = bitmap_pos_to_ord(old, oldbit, bits); + if (n < 0 || w == 0) + return oldbit; + else + return bitmap_ord_to_pos(new, n % w, bits); } EXPORT_SYMBOL(bitmap_bitremap); -- cgit v1.2.2 From 5966514db662fb24c9bb43226a80106bcffd51f8 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:47 -0800 Subject: [PATCH] cpuset: mempolicy one more nodemask conversion Finish converting mm/mempolicy.c from bitmaps to nodemasks. 
The previous conversion had left one routine using bitmaps, since it involved a corresponding change to kernel/cpuset.c Fix that interface by replacing with a simple macro that calls nodes_subset(), or if !CONFIG_CPUSET, returns (1). Signed-off-by: Paul Jackson Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 5 +++-- kernel/cpuset.c | 10 ---------- mm/mempolicy.c | 5 ++--- 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 6e2deef96b..8b21786490 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -21,7 +21,8 @@ extern void cpuset_exit(struct task_struct *p); extern cpumask_t cpuset_cpus_allowed(const struct task_struct *p); void cpuset_init_current_mems_allowed(void); void cpuset_update_current_mems_allowed(void); -void cpuset_restrict_to_mems_allowed(unsigned long *nodes); +#define cpuset_nodes_subset_current_mems_allowed(nodes) \ + nodes_subset((nodes), current->mems_allowed) int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl); extern int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask); extern int cpuset_excl_nodes_overlap(const struct task_struct *p); @@ -42,7 +43,7 @@ static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p) static inline void cpuset_init_current_mems_allowed(void) {} static inline void cpuset_update_current_mems_allowed(void) {} -static inline void cpuset_restrict_to_mems_allowed(unsigned long *nodes) {} +#define cpuset_nodes_subset_current_mems_allowed(nodes) (1) static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f63383e01e..6503c6da4c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1749,16 +1749,6 @@ done: refresh_mems(); } -/** - * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed - * @nodes: pointer to a node bitmap that is and-ed with mems_allowed - */ -void cpuset_restrict_to_mems_allowed(unsigned long *nodes) -{ - bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), - MAX_NUMNODES); -} - /** * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed * @zl: the zonelist to be checked diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7051fe450e..9dea2b8a7d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -387,10 +387,9 @@ static int contextualize_policy(int mode, nodemask_t *nodes) if (!nodes) return 0; - /* Update current mems_allowed */ cpuset_update_current_mems_allowed(); - /* Ignore nodes not set in current->mems_allowed */ - cpuset_restrict_to_mems_allowed(nodes->bits); + if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) + return -EINVAL; return mpol_check_policy(mode, nodes); } -- cgit v1.2.2 From 3e0d98b9f1eb757fc98efc84e74e54a08308aa73 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:49 -0800 Subject: [PATCH] cpuset: memory pressure meter Provide a simple per-cpuset metric of memory pressure, tracking the -rate- that the tasks in a cpuset call try_to_free_pages(), the synchronous (direct) memory reclaim code. This enables batch managers monitoring jobs running in dedicated cpusets to efficiently detect what level of memory pressure that job is causing. 
This is useful both on tightly managed systems running a wide mix of submitted jobs, which may choose to terminate or reprioritize jobs that are trying to use more memory than allowed on the nodes assigned them, and with tightly coupled, long running, massively parallel scientific computing jobs that will dramatically fail to meet required performance goals if they start to use more memory than allowed to them. This patch just provides a very economical way for the batch manager to monitor a cpuset for signs of memory pressure. It's up to the batch manager or other user code to decide what to do about it and take action. ==> Unless this feature is enabled by writing "1" to the special file /dev/cpuset/memory_pressure_enabled, the hook in the rebalance code of __alloc_pages() for this metric reduces to simply noticing that the cpuset_memory_pressure_enabled flag is zero. So only systems that enable this feature will compute the metric. Why a per-cpuset, running average: Because this meter is per-cpuset, rather than per-task or mm, the system load imposed by a batch scheduler monitoring this metric is sharply reduced on large systems, because a scan of the tasklist can be avoided on each set of queries. Because this meter is a running average, instead of an accumulating counter, a batch scheduler can detect memory pressure with a single read, instead of having to read and accumulate results for a period of time. Because this meter is per-cpuset rather than per-task or mm, the batch scheduler can obtain the key information, memory pressure in a cpuset, with a single read, rather than having to query and accumulate results over all the (dynamically changing) set of tasks in the cpuset. A per-cpuset simple digital filter (requires a spinlock and 3 words of data per-cpuset) is kept, and updated by any task attached to that cpuset, if it enters the synchronous (direct) page reclaim code. A per-cpuset file provides an integer number representing the recent (half-life of 10 seconds) rate of direct page reclaims caused by the tasks in the cpuset, in units of reclaims attempted per second, times 1000. 
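(Illustration, not part of the patch: the sketch below is a standalone userspace model of the frequency meter this patch adds. The constants FM_COEF, FM_SCALE, FM_MAXTICKS and FM_MAXCNT are taken from the patch; the spinlock and get_seconds() plumbing are replaced by an explicit 'now' argument. It shows the reported value settling near N*1000 for N reclaims per second, and falling to about half after 10 idle seconds.)

/* Userspace model of the per-cpuset memory_pressure filter. */
#include <stdio.h>

#define FM_COEF         933     /* coefficient for half-life of 10 secs */
#define FM_MAXTICKS     99      /* useless computing more ticks than this */
#define FM_MAXCNT       1000000 /* limit cnt to avoid overflow */
#define FM_SCALE        1000    /* faux fixed point scale */

struct fmeter {
        int cnt;                /* unprocessed events, scaled by FM_SCALE */
        int val;                /* filtered rate, events/sec * 1000 */
        long time;              /* time (whole seconds) of last update */
};

/* Decay 'val' once per elapsed second, then fold in unprocessed events. */
static void fmeter_update(struct fmeter *fmp, long now)
{
        long ticks = now - fmp->time;

        if (ticks == 0)
                return;
        if (ticks > FM_MAXTICKS)
                ticks = FM_MAXTICKS;
        while (ticks-- > 0)
                fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
        fmp->time = now;

        fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
        fmp->cnt = 0;
}

/* Called once per event (per direct reclaim, in the kernel version). */
static void fmeter_markevent(struct fmeter *fmp, long now)
{
        fmeter_update(fmp, now);
        fmp->cnt += FM_SCALE;
        if (fmp->cnt > FM_MAXCNT)
                fmp->cnt = FM_MAXCNT;
}

int main(void)
{
        struct fmeter fm = { 0, 0, 0 };
        long t;
        int i;

        /* 5 "direct reclaims" per second for 60 seconds */
        for (t = 1; t <= 60; t++) {
                for (i = 0; i < 5; i++)
                        fmeter_markevent(&fm, t);
        }

        fmeter_update(&fm, 61);         /* what fmeter_getrate() would report */
        printf("steady state:          %d (about 5 * 1000)\n", fm.val);

        fmeter_update(&fm, 71);         /* 10 idle seconds later */
        printf("after 10 idle seconds: %d (about half)\n", fm.val);
        return 0;
}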
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 11 +++ kernel/cpuset.c | 193 ++++++++++++++++++++++++++++++++++++++++++++++++- mm/page_alloc.c | 1 + 3 files changed, 203 insertions(+), 2 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 8b21786490..736d73801c 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -26,6 +26,15 @@ void cpuset_update_current_mems_allowed(void); int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl); extern int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask); extern int cpuset_excl_nodes_overlap(const struct task_struct *p); + +#define cpuset_memory_pressure_bump() \ + do { \ + if (cpuset_memory_pressure_enabled) \ + __cpuset_memory_pressure_bump(); \ + } while (0) +extern int cpuset_memory_pressure_enabled; +extern void __cpuset_memory_pressure_bump(void); + extern struct file_operations proc_cpuset_operations; extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer); @@ -60,6 +69,8 @@ static inline int cpuset_excl_nodes_overlap(const struct task_struct *p) return 1; } +static inline void cpuset_memory_pressure_bump(void) {} + static inline char *cpuset_task_status_allowed(struct task_struct *task, char *buffer) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6503c6da4c..5a06fef669 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -56,6 +56,15 @@ #define CPUSET_SUPER_MAGIC 0x27e0eb +/* See "Frequency meter" comments, below. */ + +struct fmeter { + int cnt; /* unprocessed events count */ + int val; /* most recent output value */ + time_t time; /* clock (secs) when val computed */ + spinlock_t lock; /* guards read or write of above */ +}; + struct cpuset { unsigned long flags; /* "unsigned long" so bitops work */ cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ @@ -80,7 +89,9 @@ struct cpuset { * Copy of global cpuset_mems_generation as of the most * recent time this cpuset changed its mems_allowed. */ - int mems_generation; + int mems_generation; + + struct fmeter fmeter; /* memory_pressure filter */ }; /* bits in struct cpuset flags field */ @@ -149,7 +160,7 @@ static struct cpuset top_cpuset = { }; static struct vfsmount *cpuset_mount; -static struct super_block *cpuset_sb = NULL; +static struct super_block *cpuset_sb; /* * We have two global cpuset semaphores below. They can nest. @@ -806,6 +817,19 @@ static int update_nodemask(struct cpuset *cs, char *buf) return retval; } +/* + * Call with manage_sem held. + */ + +static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) +{ + if (simple_strtoul(buf, NULL, 10) != 0) + cpuset_memory_pressure_enabled = 1; + else + cpuset_memory_pressure_enabled = 0; + return 0; +} + /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, @@ -847,6 +871,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) return 0; } +/* + * Frequency meter - How fast is some event occuring? + * + * These routines manage a digitally filtered, constant time based, + * event frequency meter. There are four routines: + * fmeter_init() - initialize a frequency meter. + * fmeter_markevent() - called each time the event happens. + * fmeter_getrate() - returns the recent rate of such events. + * fmeter_update() - internal routine used to update fmeter. 
+ * + * A common data structure is passed to each of these routines, + * which is used to keep track of the state required to manage the + * frequency meter and its digital filter. + * + * The filter works on the number of events marked per unit time. + * The filter is single-pole low-pass recursive (IIR). The time unit + * is 1 second. Arithmetic is done using 32-bit integers scaled to + * simulate 3 decimal digits of precision (multiplied by 1000). + * + * With an FM_COEF of 933, and a time base of 1 second, the filter + * has a half-life of 10 seconds, meaning that if the events quit + * happening, then the rate returned from the fmeter_getrate() + * will be cut in half each 10 seconds, until it converges to zero. + * + * It is not worth doing a real infinitely recursive filter. If more + * than FM_MAXTICKS ticks have elapsed since the last filter event, + * just compute FM_MAXTICKS ticks worth, by which point the level + * will be stable. + * + * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid + * arithmetic overflow in the fmeter_update() routine. + * + * Given the simple 32 bit integer arithmetic used, this meter works + * best for reporting rates between one per millisecond (msec) and + * one per 32 (approx) seconds. At constant rates faster than one + * per msec it maxes out at values just under 1,000,000. At constant + * rates between one per msec, and one per second it will stabilize + * to a value N*1000, where N is the rate of events per second. + * At constant rates between one per second and one per 32 seconds, + * it will be choppy, moving up on the seconds that have an event, + * and then decaying until the next event. At rates slower than + * about one in 32 seconds, it decays all the way back to zero between + * each event. + */ + +#define FM_COEF 933 /* coefficient for half-life of 10 secs */ +#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ +#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ +#define FM_SCALE 1000 /* faux fixed point scale */ + +/* Initialize a frequency meter */ +static void fmeter_init(struct fmeter *fmp) +{ + fmp->cnt = 0; + fmp->val = 0; + fmp->time = 0; + spin_lock_init(&fmp->lock); +} + +/* Internal meter update - process cnt events and update value */ +static void fmeter_update(struct fmeter *fmp) +{ + time_t now = get_seconds(); + time_t ticks = now - fmp->time; + + if (ticks == 0) + return; + + ticks = min(FM_MAXTICKS, ticks); + while (ticks-- > 0) + fmp->val = (FM_COEF * fmp->val) / FM_SCALE; + fmp->time = now; + + fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; + fmp->cnt = 0; +} + +/* Process any previous ticks, then bump cnt by one (times scale). */ +static void fmeter_markevent(struct fmeter *fmp) +{ + spin_lock(&fmp->lock); + fmeter_update(fmp); + fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); + spin_unlock(&fmp->lock); +} + +/* Process any previous ticks, then return current value. 
*/ +static int fmeter_getrate(struct fmeter *fmp) +{ + int val; + + spin_lock(&fmp->lock); + fmeter_update(fmp); + val = fmp->val; + spin_unlock(&fmp->lock); + return val; +} + /* * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly * writing the path of the old cpuset in 'ppathbuf' if it needs to be @@ -931,6 +1053,8 @@ typedef enum { FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_NOTIFY_ON_RELEASE, + FILE_MEMORY_PRESSURE_ENABLED, + FILE_MEMORY_PRESSURE, FILE_TASKLIST, } cpuset_filetype_t; @@ -984,6 +1108,12 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us case FILE_MEMORY_MIGRATE: retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); break; + case FILE_MEMORY_PRESSURE_ENABLED: + retval = update_memory_pressure_enabled(cs, buffer); + break; + case FILE_MEMORY_PRESSURE: + retval = -EACCES; + break; case FILE_TASKLIST: retval = attach_task(cs, buffer, &pathbuf); break; @@ -1087,6 +1217,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, case FILE_MEMORY_MIGRATE: *s++ = is_memory_migrate(cs) ? '1' : '0'; break; + case FILE_MEMORY_PRESSURE_ENABLED: + *s++ = cpuset_memory_pressure_enabled ? '1' : '0'; + break; + case FILE_MEMORY_PRESSURE: + s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); + break; default: retval = -EINVAL; goto out; @@ -1440,6 +1576,16 @@ static struct cftype cft_memory_migrate = { .private = FILE_MEMORY_MIGRATE, }; +static struct cftype cft_memory_pressure_enabled = { + .name = "memory_pressure_enabled", + .private = FILE_MEMORY_PRESSURE_ENABLED, +}; + +static struct cftype cft_memory_pressure = { + .name = "memory_pressure", + .private = FILE_MEMORY_PRESSURE, +}; + static int cpuset_populate_dir(struct dentry *cs_dentry) { int err; @@ -1456,6 +1602,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) return err; if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) return err; + if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) + return err; if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) return err; return 0; @@ -1491,6 +1639,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) INIT_LIST_HEAD(&cs->children); atomic_inc(&cpuset_mems_generation); cs->mems_generation = atomic_read(&cpuset_mems_generation); + fmeter_init(&cs->fmeter); cs->parent = parent; @@ -1580,6 +1729,7 @@ int __init cpuset_init(void) top_cpuset.cpus_allowed = CPU_MASK_ALL; top_cpuset.mems_allowed = NODE_MASK_ALL; + fmeter_init(&top_cpuset.fmeter); atomic_inc(&cpuset_mems_generation); top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); @@ -1601,6 +1751,9 @@ int __init cpuset_init(void) top_cpuset.dentry = root; root->d_inode->i_op = &cpuset_dir_inode_operations; err = cpuset_populate_dir(root); + /* memory_pressure_enabled is in root cpuset only */ + if (err == 0) + err = cpuset_add_file(root, &cft_memory_pressure_enabled); out: return err; } @@ -1890,6 +2043,42 @@ done: return overlap; } +/* + * Collection of memory_pressure is suppressed unless + * this flag is enabled by writing "1" to the special + * cpuset file 'memory_pressure_enabled' in the root cpuset. + */ + +int cpuset_memory_pressure_enabled; + +/** + * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. + * + * Keep a running average of the rate of synchronous (direct) + * page reclaim efforts initiated by tasks in each cpuset. 
+ * + * This represents the rate at which some task in the cpuset + * ran low on memory on all nodes it was allowed to use, and + * had to enter the kernels page reclaim code in an effort to + * create more free memory by tossing clean pages or swapping + * or writing dirty pages. + * + * Display to user space in the per-cpuset read-only file + * "memory_pressure". Value displayed is an integer + * representing the recent rate of entry into the synchronous + * (direct) page reclaim by any task attached to the cpuset. + **/ + +void __cpuset_memory_pressure_bump(void) +{ + struct cpuset *cs; + + task_lock(current); + cs = current->cpuset; + fmeter_markevent(&cs->fmeter); + task_unlock(current); +} + /* * proc_cpuset_show() * - Print tasks cpuset path into seq_file. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ad3d0202cd..e0e8492417 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -976,6 +976,7 @@ rebalance: cond_resched(); /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); p->flags |= PF_MEMALLOC; reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; -- cgit v1.2.2 From bd5e09cf7054878a3db6a8c8bab1c2fabcd4f072 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:50 -0800 Subject: [PATCH] cpuset: document additional features Document the additional cpuset features: notify_on_release marker_pid memory_pressure memory_pressure_enabled Rearrange and improve formatting of existing documentation for cpu_exclusive and mem_exclusive features. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cpusets.txt | 178 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 153 insertions(+), 25 deletions(-) diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index e2d9afc30d..ef83b2dd0c 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt @@ -14,7 +14,11 @@ CONTENTS: 1.1 What are cpusets ? 1.2 Why are cpusets needed ? 1.3 How are cpusets implemented ? - 1.4 How do I use cpusets ? + 1.4 What are exclusive cpusets ? + 1.5 What does notify_on_release do ? + 1.6 What is a marker_pid ? + 1.7 What is memory_pressure ? + 1.8 How do I use cpusets ? 2. Usage Examples and Syntax 2.1 Basic Usage 2.2 Adding/removing cpus @@ -49,29 +53,6 @@ its cpus_allowed vector, and the kernel page allocator will not allocate a page on a node that is not allowed in the requesting tasks mems_allowed vector. -If a cpuset is cpu or mem exclusive, no other cpuset, other than a direct -ancestor or descendent, may share any of the same CPUs or Memory Nodes. -A cpuset that is cpu exclusive has a sched domain associated with it. -The sched domain consists of all cpus in the current cpuset that are not -part of any exclusive child cpusets. -This ensures that the scheduler load balacing code only balances -against the cpus that are in the sched domain as defined above and not -all of the cpus in the system. This removes any overhead due to -load balancing code trying to pull tasks outside of the cpu exclusive -cpuset only to be prevented by the tasks' cpus_allowed mask. - -A cpuset that is mem_exclusive restricts kernel allocations for -page, buffer and other data commonly shared by the kernel across -multiple users. All cpusets, whether mem_exclusive or not, restrict -allocations of memory for user space. 
This enables configuring a -system so that several independent jobs can share common kernel -data, such as file system pages, while isolating each jobs user -allocation in its own cpuset. To do this, construct a large -mem_exclusive cpuset to hold all the jobs, and construct child, -non-mem_exclusive cpusets for each individual job. Only a small -amount of typical kernel memory, such as requests from interrupt -handlers, is allowed to be taken outside even a mem_exclusive cpuset. - User level code may create and destroy cpusets by name in the cpuset virtual file system, manage the attributes and permissions of these cpusets and which CPUs and Memory Nodes are assigned to each cpuset, @@ -196,6 +177,12 @@ containing the following files describing that cpuset: - cpu_exclusive flag: is cpu placement exclusive? - mem_exclusive flag: is memory placement exclusive? - tasks: list of tasks (by pid) attached to that cpuset + - notify_on_release flag: run /sbin/cpuset_release_agent on exit? + - marker_pid: pid of user task in co-ordinated operation sequence + - memory_pressure: measure of how much paging pressure in cpuset + +In addition, the root cpuset only has the following file: + - memory_pressure_enabled flag: compute memory_pressure? New cpusets are created using the mkdir system call or shell command. The properties of a cpuset, such as its flags, allowed @@ -229,7 +216,148 @@ exclusive cpuset. Also, the use of a Linux virtual file system (vfs) to represent the cpuset hierarchy provides for a familiar permission and name space for cpusets, with a minimum of additional kernel code. -1.4 How do I use cpusets ? + +1.4 What are exclusive cpusets ? +-------------------------------- + +If a cpuset is cpu or mem exclusive, no other cpuset, other than +a direct ancestor or descendent, may share any of the same CPUs or +Memory Nodes. + +A cpuset that is cpu_exclusive has a scheduler (sched) domain +associated with it. The sched domain consists of all CPUs in the +current cpuset that are not part of any exclusive child cpusets. +This ensures that the scheduler load balancing code only balances +against the CPUs that are in the sched domain as defined above and +not all of the CPUs in the system. This removes any overhead due to +load balancing code trying to pull tasks outside of the cpu_exclusive +cpuset only to be prevented by the tasks' cpus_allowed mask. + +A cpuset that is mem_exclusive restricts kernel allocations for +page, buffer and other data commonly shared by the kernel across +multiple users. All cpusets, whether mem_exclusive or not, restrict +allocations of memory for user space. This enables configuring a +system so that several independent jobs can share common kernel data, +such as file system pages, while isolating each jobs user allocation in +its own cpuset. To do this, construct a large mem_exclusive cpuset to +hold all the jobs, and construct child, non-mem_exclusive cpusets for +each individual job. Only a small amount of typical kernel memory, +such as requests from interrupt handlers, is allowed to be taken +outside even a mem_exclusive cpuset. + + +1.5 What does notify_on_release do ? 
+------------------------------------ + +If the notify_on_release flag is enabled (1) in a cpuset, then whenever +the last task in the cpuset leaves (exits or attaches to some other +cpuset) and the last child cpuset of that cpuset is removed, then +the kernel runs the command /sbin/cpuset_release_agent, supplying the +pathname (relative to the mount point of the cpuset file system) of the +abandoned cpuset. This enables automatic removal of abandoned cpusets. +The default value of notify_on_release in the root cpuset at system +boot is disabled (0). The default value of other cpusets at creation +is the current value of their parents notify_on_release setting. + + +1.6 What is a marker_pid ? +-------------------------- + +The marker_pid helps manage cpuset changes safely from user space. + +The interface presented to user space for cpusets uses system wide +numbering of CPUs and Memory Nodes. It is the responsibility of +user level code, presumably in a library, to present cpuset-relative +numbering to applications when that would be more useful to them. + +However if a task is moved to a different cpuset, or if the 'cpus' or +'mems' of a cpuset are changed, then we need a way for such library +code to detect that its cpuset-relative numbering has changed, when +expressed using system wide numbering. + +The kernel cannot safely allow user code to lock kernel resources. +The kernel could deliver out-of-band notice of cpuset changes by +such mechanisms as signals or usermodehelper callbacks, however +this can't be synchronously delivered to library code linked in +applications without intruding on the IPC mechanisms available to +the app. The kernel could require user level code to do all the work, +tracking the cpuset state before and during changes, to verify no +unexpected change occurred, but this becomes an onerous task. + +The "marker_pid" cpuset field provides a simple way to make this task +less onerous on user library code. A task writes its pid to a cpusets +"marker_pid" at the start of a sequence of queries and updates, +and check as it goes that the cpusets marker_pid doesn't change. +The pread(2) system call does a seek and read in a single call. +If the marker_pid changes, the user code should retry the required +sequence of operations. + +Anytime that a task modifies the "cpus" or "mems" of a cpuset, +unless it's pid is in the cpusets marker_pid field, the kernel zeros +this field. + +The above was inspired by the load linked and store conditional +(ll/sc) instructions in the MIPS II instruction set. + + +1.7 What is memory_pressure ? +----------------------------- +The memory_pressure of a cpuset provides a simple per-cpuset metric +of the rate that the tasks in a cpuset are attempting to free up in +use memory on the nodes of the cpuset to satisfy additional memory +requests. + +This enables batch managers monitoring jobs running in dedicated +cpusets to efficiently detect what level of memory pressure that job +is causing. + +This is useful both on tightly managed systems running a wide mix of +submitted jobs, which may choose to terminate or re-prioritize jobs that +are trying to use more memory than allowed on the nodes assigned them, +and with tightly coupled, long running, massively parallel scientific +computing jobs that will dramatically fail to meet required performance +goals if they start to use more memory than allowed to them. + +This mechanism provides a very economical way for the batch manager +to monitor a cpuset for signs of memory pressure. 
It's up to the +batch manager or other user code to decide what to do about it and +take action. + +==> Unless this feature is enabled by writing "1" to the special file + /dev/cpuset/memory_pressure_enabled, the hook in the rebalance + code of __alloc_pages() for this metric reduces to simply noticing + that the cpuset_memory_pressure_enabled flag is zero. So only + systems that enable this feature will compute the metric. + +Why a per-cpuset, running average: + + Because this meter is per-cpuset, rather than per-task or mm, + the system load imposed by a batch scheduler monitoring this + metric is sharply reduced on large systems, because a scan of + the tasklist can be avoided on each set of queries. + + Because this meter is a running average, instead of an accumulating + counter, a batch scheduler can detect memory pressure with a + single read, instead of having to read and accumulate results + for a period of time. + + Because this meter is per-cpuset rather than per-task or mm, + the batch scheduler can obtain the key information, memory + pressure in a cpuset, with a single read, rather than having to + query and accumulate results over all the (dynamically changing) + set of tasks in the cpuset. + +A per-cpuset simple digital filter (requires a spinlock and 3 words +of data per-cpuset) is kept, and updated by any task attached to that +cpuset, if it enters the synchronous (direct) page reclaim code. + +A per-cpuset file provides an integer number representing the recent +(half-life of 10 seconds) rate of direct page reclaims caused by +the tasks in the cpuset, in units of reclaims attempted per second, +times 1000. + + +1.8 How do I use cpusets ? -------------------------- In order to minimize the impact of cpusets on critical kernel -- cgit v1.2.2 From 90c9cc4043fd8454d11886379f841f70e176698e Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:51 -0800 Subject: [PATCH] cpuset: remove marker_pid documentation Remove documentation for the cpuset 'marker_pid' feature, that was in the patch "cpuset: change marker for relative numbering" That patch was previously pulled from *-mm at my (pj) request. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cpusets.txt | 50 ++++------------------------------------------- 1 file changed, 4 insertions(+), 46 deletions(-) diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index ef83b2dd0c..9e49b1c357 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt @@ -16,9 +16,8 @@ CONTENTS: 1.3 How are cpusets implemented ? 1.4 What are exclusive cpusets ? 1.5 What does notify_on_release do ? - 1.6 What is a marker_pid ? - 1.7 What is memory_pressure ? - 1.8 How do I use cpusets ? + 1.6 What is memory_pressure ? + 1.7 How do I use cpusets ? 2. Usage Examples and Syntax 2.1 Basic Usage 2.2 Adding/removing cpus @@ -178,7 +177,6 @@ containing the following files describing that cpuset: - mem_exclusive flag: is memory placement exclusive? - tasks: list of tasks (by pid) attached to that cpuset - notify_on_release flag: run /sbin/cpuset_release_agent on exit? - - marker_pid: pid of user task in co-ordinated operation sequence - memory_pressure: measure of how much paging pressure in cpuset In addition, the root cpuset only has the following file: @@ -260,47 +258,7 @@ boot is disabled (0). The default value of other cpusets at creation is the current value of their parents notify_on_release setting. -1.6 What is a marker_pid ? 
--------------------------- - -The marker_pid helps manage cpuset changes safely from user space. - -The interface presented to user space for cpusets uses system wide -numbering of CPUs and Memory Nodes. It is the responsibility of -user level code, presumably in a library, to present cpuset-relative -numbering to applications when that would be more useful to them. - -However if a task is moved to a different cpuset, or if the 'cpus' or -'mems' of a cpuset are changed, then we need a way for such library -code to detect that its cpuset-relative numbering has changed, when -expressed using system wide numbering. - -The kernel cannot safely allow user code to lock kernel resources. -The kernel could deliver out-of-band notice of cpuset changes by -such mechanisms as signals or usermodehelper callbacks, however -this can't be synchronously delivered to library code linked in -applications without intruding on the IPC mechanisms available to -the app. The kernel could require user level code to do all the work, -tracking the cpuset state before and during changes, to verify no -unexpected change occurred, but this becomes an onerous task. - -The "marker_pid" cpuset field provides a simple way to make this task -less onerous on user library code. A task writes its pid to a cpusets -"marker_pid" at the start of a sequence of queries and updates, -and check as it goes that the cpusets marker_pid doesn't change. -The pread(2) system call does a seek and read in a single call. -If the marker_pid changes, the user code should retry the required -sequence of operations. - -Anytime that a task modifies the "cpus" or "mems" of a cpuset, -unless it's pid is in the cpusets marker_pid field, the kernel zeros -this field. - -The above was inspired by the load linked and store conditional -(ll/sc) instructions in the MIPS II instruction set. - - -1.7 What is memory_pressure ? +1.6 What is memory_pressure ? ----------------------------- The memory_pressure of a cpuset provides a simple per-cpuset metric of the rate that the tasks in a cpuset are attempting to free up in @@ -357,7 +315,7 @@ the tasks in the cpuset, in units of reclaims attempted per second, times 1000. -1.8 How do I use cpusets ? +1.7 How do I use cpusets ? -------------------------- In order to minimize the impact of cpusets on critical kernel -- cgit v1.2.2 From c5b2aff89635495064592dc90da595f8a880ee87 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:51 -0800 Subject: [PATCH] cpuset: minor spacing initializer fixes Four trivial cpuset fixes: remove extra spaces, remove useless initializers, mark one __read_mostly. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5a06fef669..f513dd937e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -54,7 +54,7 @@ #include #include -#define CPUSET_SUPER_MAGIC 0x27e0eb +#define CPUSET_SUPER_MAGIC 0x27e0eb /* See "Frequency meter" comments, below. */ @@ -154,9 +154,6 @@ static struct cpuset top_cpuset = { .count = ATOMIC_INIT(0), .sibling = LIST_HEAD_INIT(top_cpuset.sibling), .children = LIST_HEAD_INIT(top_cpuset.children), - .parent = NULL, - .dentry = NULL, - .mems_generation = 0, }; static struct vfsmount *cpuset_mount; @@ -1341,7 +1338,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode) /* * cpuset_create_dir - create a directory for an object. 
- * cs: the cpuset we create the directory for. + * cs: the cpuset we create the directory for. * It must have a valid ->parent field * And we are going to fill its ->dentry field. * name: The name to give to the cpuset directory. Will be copied. @@ -2049,7 +2046,7 @@ done: * cpuset file 'memory_pressure_enabled' in the root cpuset. */ -int cpuset_memory_pressure_enabled; +int cpuset_memory_pressure_enabled __read_mostly; /** * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. -- cgit v1.2.2 From 59dac16fb95f09253b8086134443abeb439703cd Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:52 -0800 Subject: [PATCH] cpuset: update_nodemask code reformat Restructure code layout of the kernel/cpuset.c update_nodemask() routine, removing embedded returns and nested if's in favor of goto completion labels. This is being done in anticipation of adding more logic to this routine, which will favor the goto style structure. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f513dd937e..3ea63da11d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -799,18 +799,23 @@ static int update_nodemask(struct cpuset *cs, char *buf) trialcs = *cs; retval = nodelist_parse(buf, trialcs.mems_allowed); if (retval < 0) - return retval; + goto done; nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); - if (nodes_empty(trialcs.mems_allowed)) - return -ENOSPC; - retval = validate_change(cs, &trialcs); - if (retval == 0) { - down(&callback_sem); - cs->mems_allowed = trialcs.mems_allowed; - atomic_inc(&cpuset_mems_generation); - cs->mems_generation = atomic_read(&cpuset_mems_generation); - up(&callback_sem); + if (nodes_empty(trialcs.mems_allowed)) { + retval = -ENOSPC; + goto done; } + retval = validate_change(cs, &trialcs); + if (retval < 0) + goto done; + + down(&callback_sem); + cs->mems_allowed = trialcs.mems_allowed; + atomic_inc(&cpuset_mems_generation); + cs->mems_generation = atomic_read(&cpuset_mems_generation); + up(&callback_sem); + +done: return retval; } -- cgit v1.2.2 From b4b2641843db124637fa3d2cb2101982035dcc82 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:53 -0800 Subject: [PATCH] cpuset: fork hook fix Fix obscure, never seen in real life, cpuset fork race. The cpuset_fork() call in fork.c was setting up the correct task->cpuset pointer after the tasklist_lock was dropped, which briefly exposed the newly forked process with an unsafe (copied from parent without locks or usage counter increment) cpuset pointer. In theory, that exposed cpuset pointer could have been pointing at a cpuset that was already freed and removed, and in theory another task that had been sitting on the tasklist_lock waiting to scan the task list could have raced down the entire tasklist, found our new child at the far end, and dereferenced that bogus cpuset pointer. To fix, setup up the correct cpuset pointer in the new child by calling cpuset_fork() before the new task is linked into the tasklist, and with that, add a fork failure case, to dereference that cpuset, if the fork fails along the way, after cpuset_fork() was called. Had to remove a BUG_ON() from cpuset_exit(), because it was no longer valid - the call to cpuset_exit() from a failed fork would not have PF_EXITING set. 
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 4 +--- kernel/fork.c | 6 ++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3ea63da11d..d9349cc48b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1821,15 +1821,13 @@ void cpuset_fork(struct task_struct *child) * * We don't need to task_lock() this reference to tsk->cpuset, * because tsk is already marked PF_EXITING, so attach_task() won't - * mess with it. + * mess with it, or task is a failed fork, never visible to attach_task. **/ void cpuset_exit(struct task_struct *tsk) { struct cpuset *cs; - BUG_ON(!(tsk->flags & PF_EXITING)); - cs = tsk->cpuset; tsk->cpuset = NULL; diff --git a/kernel/fork.c b/kernel/fork.c index 7fe3adfa65..7992ee759d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -972,12 +972,13 @@ static task_t *copy_process(unsigned long clone_flags, p->io_context = NULL; p->io_wait = NULL; p->audit_context = NULL; + cpuset_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_copy(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup; + goto bad_fork_cleanup_cpuset; } #endif @@ -1148,7 +1149,6 @@ static task_t *copy_process(unsigned long clone_flags, total_forks++; write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cpuset_fork(p); retval = 0; fork_out: @@ -1180,7 +1180,9 @@ bad_fork_cleanup_security: bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_free(p->mempolicy); +bad_fork_cleanup_cpuset: #endif + cpuset_exit(p); bad_fork_cleanup: if (p->binfmt) module_put(p->binfmt->module); -- cgit v1.2.2 From cf2a473c4089aa41c26f653200673f5a4cc25047 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:54 -0800 Subject: [PATCH] cpuset: combine refresh_mems and update_mems The important code paths through alloc_pages_current() and alloc_page_vma(), by which most kernel page allocations go, both called cpuset_update_current_mems_allowed(), which in turn called refresh_mems(). -Both- of these latter two routines did a tasklock, got the tasks cpuset pointer, and checked for out of date cpuset->mems_generation. That was a silly duplication of code and waste of CPU cycles on an important code path. Consolidated those two routines into a single routine, called cpuset_update_task_memory_state(), since it updates more than just mems_allowed. Changed all callers of either routine to call the new consolidated routine. 
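(Illustration, not part of the patch: the consolidated routine works by comparing a per-task generation number against the cpuset's mems_generation and refreshing the cached placement only when they differ. The userspace sketch below models just that lazy, generation-driven refresh; the structure names and the single pthread mutex, which stands in for both task_lock() and callback_sem, are invented for the sketch.)

#include <pthread.h>
#include <stdio.h>

struct shared_placement {
        pthread_mutex_t lock;
        int generation;                 /* bumped on every placement change */
        unsigned mems_allowed;          /* authoritative value */
};

struct task_view {
        int seen_generation;            /* generation of the cached copy */
        unsigned mems_allowed;          /* cached copy used on hot paths */
};

static struct shared_placement shared = {
        PTHREAD_MUTEX_INITIALIZER, 1, 0x0f
};

/* Hot-path helper: refresh the cached copy only if it is stale. */
static void update_task_memory_state(struct task_view *tv)
{
        pthread_mutex_lock(&shared.lock);
        if (tv->seen_generation != shared.generation) {
                tv->mems_allowed = shared.mems_allowed;
                tv->seen_generation = shared.generation;
        }
        pthread_mutex_unlock(&shared.lock);
}

/* Administrative path: change placement and publish a new generation. */
static void set_placement(unsigned mems)
{
        pthread_mutex_lock(&shared.lock);
        shared.mems_allowed = mems;
        shared.generation++;
        pthread_mutex_unlock(&shared.lock);
}

int main(void)
{
        struct task_view tv = { 0, 0 };

        update_task_memory_state(&tv);
        printf("cached mems: %#x\n", tv.mems_allowed);  /* 0xf */
        set_placement(0xf0);
        update_task_memory_state(&tv);
        printf("cached mems: %#x\n", tv.mems_allowed);  /* 0xf0 */
        return 0;
}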
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 4 +-- kernel/cpuset.c | 95 ++++++++++++++++++++++---------------------------- mm/mempolicy.c | 10 +++--- 3 files changed, 48 insertions(+), 61 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 736d73801c..1feebf16ab 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -20,7 +20,7 @@ extern void cpuset_fork(struct task_struct *p); extern void cpuset_exit(struct task_struct *p); extern cpumask_t cpuset_cpus_allowed(const struct task_struct *p); void cpuset_init_current_mems_allowed(void); -void cpuset_update_current_mems_allowed(void); +void cpuset_update_task_memory_state(void); #define cpuset_nodes_subset_current_mems_allowed(nodes) \ nodes_subset((nodes), current->mems_allowed) int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl); @@ -51,7 +51,7 @@ static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p) } static inline void cpuset_init_current_mems_allowed(void) {} -static inline void cpuset_update_current_mems_allowed(void) {} +static inline void cpuset_update_task_memory_state(void) {} #define cpuset_nodes_subset_current_mems_allowed(nodes) (1) static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d9349cc48b..e9917d7162 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -584,13 +584,26 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) BUG_ON(!nodes_intersects(*pmask, node_online_map)); } -/* - * Refresh current tasks mems_allowed and mems_generation from current - * tasks cpuset. +/** + * cpuset_update_task_memory_state - update task memory placement * - * Call without callback_sem or task_lock() held. May be called with - * or without manage_sem held. Will acquire task_lock() and might - * acquire callback_sem during call. + * If the current tasks cpusets mems_allowed changed behind our + * backs, update current->mems_allowed, mems_generation and task NUMA + * mempolicy to the new value. + * + * Task mempolicy is updated by rebinding it relative to the + * current->cpuset if a task has its memory placement changed. + * Do not call this routine if in_interrupt(). + * + * Call without callback_sem or task_lock() held. May be called + * with or without manage_sem held. Except in early boot or + * an exiting task, when tsk->cpuset is NULL, this routine will + * acquire task_lock(). We don't need to use task_lock to guard + * against another task changing a non-NULL cpuset pointer to NULL, + * as that is only done by a task on itself, and if the current task + * is here, it is not simultaneously in the exit code NULL'ing its + * cpuset pointer. This routine also might acquire callback_sem and + * current->mm->mmap_sem during call. * * The task_lock() is required to dereference current->cpuset safely. * Without it, we could pick up the pointer value of current->cpuset @@ -605,32 +618,36 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * task has been modifying its cpuset. 
*/ -static void refresh_mems(void) +void cpuset_update_task_memory_state() { int my_cpusets_mem_gen; + struct task_struct *tsk = current; + struct cpuset *cs = tsk->cpuset; - task_lock(current); - my_cpusets_mem_gen = current->cpuset->mems_generation; - task_unlock(current); + if (unlikely(!cs)) + return; + + task_lock(tsk); + my_cpusets_mem_gen = cs->mems_generation; + task_unlock(tsk); - if (current->cpuset_mems_generation != my_cpusets_mem_gen) { - struct cpuset *cs; - nodemask_t oldmem = current->mems_allowed; + if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { + nodemask_t oldmem = tsk->mems_allowed; int migrate; down(&callback_sem); - task_lock(current); - cs = current->cpuset; + task_lock(tsk); + cs = tsk->cpuset; /* Maybe changed when task not locked */ migrate = is_memory_migrate(cs); - guarantee_online_mems(cs, ¤t->mems_allowed); - current->cpuset_mems_generation = cs->mems_generation; - task_unlock(current); + guarantee_online_mems(cs, &tsk->mems_allowed); + tsk->cpuset_mems_generation = cs->mems_generation; + task_unlock(tsk); up(&callback_sem); - if (!nodes_equal(oldmem, current->mems_allowed)) { - numa_policy_rebind(&oldmem, ¤t->mems_allowed); + numa_policy_rebind(&oldmem, &tsk->mems_allowed); + if (!nodes_equal(oldmem, tsk->mems_allowed)) { if (migrate) { - do_migrate_pages(current->mm, &oldmem, - ¤t->mems_allowed, + do_migrate_pages(tsk->mm, &oldmem, + &tsk->mems_allowed, MPOL_MF_MOVE_ALL); } } @@ -1630,7 +1647,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) return -ENOMEM; down(&manage_sem); - refresh_mems(); + cpuset_update_task_memory_state(); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); @@ -1688,7 +1705,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_sem already */ down(&manage_sem); - refresh_mems(); + cpuset_update_task_memory_state(); if (atomic_read(&cs->count) > 0) { up(&manage_sem); return -EBUSY; @@ -1872,36 +1889,6 @@ void cpuset_init_current_mems_allowed(void) current->mems_allowed = NODE_MASK_ALL; } -/** - * cpuset_update_current_mems_allowed - update mems parameters to new values - * - * If the current tasks cpusets mems_allowed changed behind our backs, - * update current->mems_allowed and mems_generation to the new value. - * Do not call this routine if in_interrupt(). - * - * Call without callback_sem or task_lock() held. May be called - * with or without manage_sem held. Unless exiting, it will acquire - * task_lock(). Also might acquire callback_sem during call to - * refresh_mems(). - */ - -void cpuset_update_current_mems_allowed(void) -{ - struct cpuset *cs; - int need_to_refresh = 0; - - task_lock(current); - cs = current->cpuset; - if (!cs) - goto done; - if (current->cpuset_mems_generation != cs->mems_generation) - need_to_refresh = 1; -done: - task_unlock(current); - if (need_to_refresh) - refresh_mems(); -} - /** * cpuset_zonelist_valid_mems_allowed - check zonelist vs. 
curremt mems_allowed * @zl: the zonelist to be checked diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9dea2b8a7d..515bfeee02 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -387,7 +387,7 @@ static int contextualize_policy(int mode, nodemask_t *nodes) if (!nodes) return 0; - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) return -EINVAL; return mpol_check_policy(mode, nodes); @@ -461,7 +461,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; if (flags & MPOL_F_ADDR) { @@ -1089,7 +1089,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (unlikely(pol->policy == MPOL_INTERLEAVE)) { unsigned nid; @@ -1115,7 +1115,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. * - * Don't call cpuset_update_current_mems_allowed() unless + * Don't call cpuset_update_task_memory_state() unless * 1) it's ok to take cpuset_sem (can WAIT), and * 2) allocating for current task (not interrupt). */ @@ -1124,7 +1124,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) struct mempolicy *pol = current->mempolicy; if ((gfp & __GFP_WAIT) && !in_interrupt()) - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (!pol || in_interrupt()) pol = &default_policy; if (pol->policy == MPOL_INTERLEAVE) -- cgit v1.2.2 From 909d75a3b77bdd8baa9429bad3b69a654d2954ce Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:55 -0800 Subject: [PATCH] cpuset: implement cpuset_mems_allowed Provide a cpuset_mems_allowed() method, which the sys_migrate_pages() code needed, to obtain the mems_allowed vector of a cpuset, and replaced the workaround in sys_migrate_pages() to call this new method. 
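A purely illustrative caller (hypothetical code, not a hunk from this patch) showing the kind of check sys_migrate_pages() can make once a real cpuset_mems_allowed() exists:

	nodemask_t task_nodes;

	task_nodes = cpuset_mems_allowed(task);		/* nodes the task's cpuset permits */
	if (!nodes_subset(new_nodes, task_nodes))
		return -EPERM;				/* illustrative permission policy only */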
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 8 +++++++- kernel/cpuset.c | 29 ++++++++++++++++++++++++++--- mm/mempolicy.c | 3 --- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 1feebf16ab..37d2dd7ca3 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -18,7 +18,8 @@ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_fork(struct task_struct *p); extern void cpuset_exit(struct task_struct *p); -extern cpumask_t cpuset_cpus_allowed(const struct task_struct *p); +extern cpumask_t cpuset_cpus_allowed(struct task_struct *p); +extern nodemask_t cpuset_mems_allowed(struct task_struct *p); void cpuset_init_current_mems_allowed(void); void cpuset_update_task_memory_state(void); #define cpuset_nodes_subset_current_mems_allowed(nodes) \ @@ -50,6 +51,11 @@ static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p) return cpu_possible_map; } +static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) +{ + return node_possible_map; +} + static inline void cpuset_init_current_mems_allowed(void) {} static inline void cpuset_update_task_memory_state(void) {} #define cpuset_nodes_subset_current_mems_allowed(nodes) (1) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e9917d7162..0d0dbbd656 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1871,14 +1871,14 @@ void cpuset_exit(struct task_struct *tsk) * tasks cpuset. **/ -cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) +cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) { cpumask_t mask; down(&callback_sem); - task_lock((struct task_struct *)tsk); + task_lock(tsk); guarantee_online_cpus(tsk->cpuset, &mask); - task_unlock((struct task_struct *)tsk); + task_unlock(tsk); up(&callback_sem); return mask; @@ -1889,6 +1889,29 @@ void cpuset_init_current_mems_allowed(void) current->mems_allowed = NODE_MASK_ALL; } +/** + * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. + * + * Description: Returns the nodemask_t mems_allowed of the cpuset + * attached to the specified @tsk. Guaranteed to return some non-empty + * subset of node_online_map, even if this means going outside the + * tasks cpuset. + **/ + +nodemask_t cpuset_mems_allowed(struct task_struct *tsk) +{ + nodemask_t mask; + + down(&callback_sem); + task_lock(tsk); + guarantee_online_mems(tsk->cpuset, &mask); + task_unlock(tsk); + up(&callback_sem); + + return mask; +} + /** * cpuset_zonelist_valid_mems_allowed - check zonelist vs. 
curremt mems_allowed * @zl: the zonelist to be checked diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 515bfeee02..34d566ac14 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -772,9 +772,6 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, return do_set_mempolicy(mode, &nodes); } -/* Macro needed until Paul implements this function in kernel/cpusets.c */ -#define cpuset_mems_allowed(task) node_online_map - asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) -- cgit v1.2.2 From 74cb21553f4bf244185b9bec4c26e4e3169ad55e Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:56 -0800 Subject: [PATCH] cpuset: numa_policy_rebind cleanup Cleanup, reorganize and make more robust the mempolicy.c code to rebind mempolicies relative to the containing cpuset after a tasks memory placement changes. The real motivator for this cleanup patch is to lay more groundwork for the upcoming patch to correctly rebind NUMA mempolicies that are attached to vma's after the containing cpuset memory placement changes. NUMA mempolicies are constrained by the cpuset their task is a member of. When either (1) a task is moved to a different cpuset, or (2) the 'mems' mems_allowed of a cpuset is changed, then the NUMA mempolicies have embedded node numbers (for MPOL_BIND, MPOL_INTERLEAVE and MPOL_PREFERRED) that need to be recalculated, relative to their new cpuset placement. The old code used an unreliable method of determining what was the old mems_allowed constraining the mempolicy. It just looked at the tasks mems_allowed value. This sort of worked with the present code, that just rebinds the -task- mempolicy, and leaves any -vma- mempolicies broken, referring to the old nodes. But in an upcoming patch, the vma mempolicies will be rebound as well. Then the order in which the various task and vma mempolicies are updated will no longer be deterministic, and one can no longer count on the task->mems_allowed holding the old value for as long as needed. It's not even clear if the current code was guaranteed to work reliably for task mempolicies. So I added a mems_allowed field to each mempolicy, stating exactly what mems_allowed the policy is relative to, and updated synchronously and reliably anytime that the mempolicy is rebound. Also removed a useless wrapper routine, numa_policy_rebind(), and had its caller, cpuset_update_task_memory_state(), call directly to the rewritten policy_rebind() routine, and made that rebind routine extern instead of static, and added a "mpol_" prefix to its name, making it mpol_rebind_policy(). 
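Condensed from the diff below, the heart of the new scheme: each mempolicy records the mems_allowed it was built against, and a rebind remaps node numbers from that recorded mask to the new one instead of trusting task->mems_allowed (MPOL_INTERLEAVE case shown):

	nodemask_t tmp;
	nodemask_t *mpolmask = &pol->cpuset_mems_allowed;

	if (nodes_equal(*mpolmask, *newmask))
		return;					/* already relative to these nodes */
	nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
	pol->v.nodes = tmp;
	*mpolmask = *newmask;				/* remember the new reference mask */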
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 12 ++++++++++-- kernel/cpuset.c | 2 +- mm/mempolicy.c | 31 +++++++++++++++++++------------ 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 05fddd5bee..74357cb9bc 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -68,6 +68,7 @@ struct mempolicy { nodemask_t nodes; /* interleave */ /* undefined for default */ } v; + nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */ }; /* @@ -146,7 +147,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, extern void numa_default_policy(void); extern void numa_policy_init(void); -extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new); +extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new); +extern void mpol_rebind_task(struct task_struct *tsk, + const nodemask_t *new); extern struct mempolicy default_policy; extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr); @@ -221,7 +224,12 @@ static inline void numa_default_policy(void) { } -static inline void numa_policy_rebind(const nodemask_t *old, +static inline void mpol_rebind_policy(struct mempolicy *pol, + const nodemask_t *new) +{ +} + +static inline void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0d0dbbd656..8f764de3a9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -643,7 +643,7 @@ void cpuset_update_task_memory_state() tsk->cpuset_mems_generation = cs->mems_generation; task_unlock(tsk); up(&callback_sem); - numa_policy_rebind(&oldmem, &tsk->mems_allowed); + mpol_rebind_task(tsk, &tsk->mems_allowed); if (!nodes_equal(oldmem, tsk->mems_allowed)) { if (migrate) { do_migrate_pages(tsk->mm, &oldmem, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 34d566ac14..c39bd86f4e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -180,6 +180,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) break; } policy->policy = mode; + policy->cpuset_mems_allowed = cpuset_mems_allowed(current); return policy; } @@ -1411,25 +1412,31 @@ void numa_default_policy(void) } /* Migrate a policy to a different set of nodes */ -static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, - const nodemask_t *new) +void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { + nodemask_t *mpolmask; nodemask_t tmp; if (!pol) return; + mpolmask = &pol->cpuset_mems_allowed; + if (nodes_equal(*mpolmask, *newmask)) + return; switch (pol->policy) { case MPOL_DEFAULT: break; case MPOL_INTERLEAVE: - nodes_remap(tmp, pol->v.nodes, *old, *new); + nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); pol->v.nodes = tmp; - current->il_next = node_remap(current->il_next, *old, *new); + *mpolmask = *newmask; + current->il_next = node_remap(current->il_next, + *mpolmask, *newmask); break; case MPOL_PREFERRED: pol->v.preferred_node = node_remap(pol->v.preferred_node, - *old, *new); + *mpolmask, *newmask); + *mpolmask = *newmask; break; case MPOL_BIND: { nodemask_t nodes; @@ -1439,7 +1446,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, nodes_clear(nodes); for (z = pol->v.zonelist->zones; *z; z++) node_set((*z)->zone_pgdat->node_id, nodes); - nodes_remap(tmp, nodes, *old, *new); + nodes_remap(tmp, nodes, *mpolmask, *newmask); nodes = tmp; zonelist = bind_zonelist(&nodes); 
@@ -1454,6 +1461,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, kfree(pol->v.zonelist); pol->v.zonelist = zonelist; } + *mpolmask = *newmask; break; } default: @@ -1463,14 +1471,13 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, } /* - * Someone moved this task to different nodes. Fixup mempolicies. - * - * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, - * once we have a cpuset mechanism to mark which cpuset subtree is migrating. + * Wrapper for mpol_rebind_policy() that just requires task + * pointer, and updates task mempolicy. */ -void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) + +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { - rebind_policy(current->mempolicy, old, new); + mpol_rebind_policy(tsk->mempolicy, new); } /* -- cgit v1.2.2 From 202f72d5d1b5c2c084f63ef996c736d208b447b5 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:57 -0800 Subject: [PATCH] cpuset: number_of_cpusets optimization Easy little optimization hack to avoid actually having to call cpuset_zone_allowed() and check mems_allowed, in the main page allocation routine, __alloc_pages(). This saves several CPU cycles per page allocation on systems not using cpusets. A counter is updated each time a cpuset is created or removed, and whenever there is only one cpuset in the system, it must be the root cpuset, which contains all CPUs and all Memory Nodes. In that case, when the counter is one, all allocations are allowed. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 10 +++++++++- kernel/cpuset.c | 12 +++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 37d2dd7ca3..34081c168a 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -14,6 +14,8 @@ #ifdef CONFIG_CPUSETS +extern int number_of_cpusets; /* How many cpusets are defined in system? */ + extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_fork(struct task_struct *p); @@ -25,7 +27,13 @@ void cpuset_update_task_memory_state(void); #define cpuset_nodes_subset_current_mems_allowed(nodes) \ nodes_subset((nodes), current->mems_allowed) int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl); -extern int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask); + +extern int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask); +static int inline cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +{ + return number_of_cpusets <= 1 || __cpuset_zone_allowed(z, gfp_mask); +} + extern int cpuset_excl_nodes_overlap(const struct task_struct *p); #define cpuset_memory_pressure_bump() \ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8f764de3a9..6004719f26 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -56,6 +56,13 @@ #define CPUSET_SUPER_MAGIC 0x27e0eb +/* + * Tracks how many cpusets are currently defined in system. + * When there is only one cpuset (the root cpuset) we can + * short circuit some hooks. + */ +int number_of_cpusets; + /* See "Frequency meter" comments, below. 
*/ struct fmeter { @@ -1664,6 +1671,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) down(&callback_sem); list_add(&cs->sibling, &cs->parent->children); + number_of_cpusets++; up(&callback_sem); err = cpuset_create_dir(cs, name, mode); @@ -1726,6 +1734,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); + number_of_cpusets--; up(&callback_sem); if (list_empty(&parent->children)) check_for_release(parent, &pathbuf); @@ -1769,6 +1778,7 @@ int __init cpuset_init(void) root->d_inode->i_nlink++; top_cpuset.dentry = root; root->d_inode->i_op = &cpuset_dir_inode_operations; + number_of_cpusets = 1; err = cpuset_populate_dir(root); /* memory_pressure_enabled is in root cpuset only */ if (err == 0) @@ -1982,7 +1992,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * GFP_USER - only nodes in current tasks mems allowed ok. **/ -int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { int node; /* node that zone z is on */ const struct cpuset *cs; /* current cpuset ancestors */ -- cgit v1.2.2 From 4225399a66b315d4d1fb1cb61b75dda201c832e3 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:59 -0800 Subject: [PATCH] cpuset: rebind vma mempolicies fix Fix more of longstanding bug in cpuset/mempolicy interaction. NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset to just the Memory Nodes allowed by that cpuset. The kernel maintains internal state for each mempolicy, tracking what nodes are used for the MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies. When a tasks cpuset memory placement changes, whether because the cpuset changed, or because the task was attached to a different cpuset, then the tasks mempolicies have to be rebound to the new cpuset placement, so as to preserve the cpuset-relative numbering of the nodes in that policy. An earlier fix handled such mempolicy rebinding for mempolicies attached to a task. This fix rebinds mempolicies attached to vma's (address ranges in a tasks address space.) Due to the need to hold the task->mm->mmap_sem semaphore while updating vma's, the rebinding of vma mempolicies has to be done when the cpuset memory placement is changed, at which time mmap_sem can be safely acquired. The tasks mempolicy is rebound later, when the task next attempts to allocate memory and notices that its task->cpuset_mems_generation is out-of-date with its cpusets mems_generation. Because walking the tasklist to find all tasks attached to a changing cpuset requires holding tasklist_lock, a spinlock, one cannot update the vma's of the affected tasks while doing the tasklist scan. In general, one cannot acquire a semaphore (which can sleep) while already holding a spinlock (such as tasklist_lock). So a list of mm references has to be built up during the tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem acquired, and the vma's in that mm rebound. Once the tasklist lock is dropped, affected tasks may fork new tasks, before their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to point to the cpuset being rebound (there can only be one; cpuset modifications are done under a global 'manage_sem' semaphore), and the mpol_copy code that is used to copy a tasks mempolicies during fork catches such forking tasks, and ensures their children are also rebound. 
When a task is moved to a different cpuset, it is easier, as there is only one task involved. Its mm->vma's are scanned, using the same mpol_rebind_policy() as used above. It may happen that both the mpol_copy hook and the update done via the tasklist scan update the same mm twice. This is ok, as the mempolicies of each vma in an mm keep track of what mems_allowed they are relative to, and safely no-op a second request to rebind to the same nodes. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 18 ++++++++++ kernel/cpuset.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++ mm/mempolicy.c | 29 +++++++++++++++ 3 files changed, 137 insertions(+) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 74357cb9bc..c7ac77e873 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -150,6 +150,16 @@ extern void numa_policy_init(void); extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); +extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); +#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x)) + +#ifdef CONFIG_CPUSET +#define current_cpuset_is_being_rebound() \ + (cpuset_being_rebound == current->cpuset) +#else +#define current_cpuset_is_being_rebound() 0 +#endif + extern struct mempolicy default_policy; extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr); @@ -165,6 +175,8 @@ static inline void check_highest_zone(int k) int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags); +extern void *cpuset_being_rebound; /* Trigger mpol_copy vma rebind */ + #else struct mempolicy {}; @@ -234,6 +246,12 @@ static inline void mpol_rebind_task(struct task_struct *tsk, { } +static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) +{ +} + +#define set_cpuset_being_rebound(x) do {} while (0) + static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6004719f26..19f87565be 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -812,12 +812,24 @@ static int update_cpumask(struct cpuset *cs, char *buf) } /* + * Handle user request to change the 'mems' memory placement + * of a cpuset. Needs to validate the request, update the + * cpusets mems_allowed and mems_generation, and for each + * task in the cpuset, rebind any vma mempolicies. + * * Call with manage_sem held. May take callback_sem during call. + * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, + * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; + struct task_struct *g, *p; + struct mm_struct **mmarray; + int i, n, ntasks; + int fudge; int retval; trialcs = *cs; @@ -839,6 +851,76 @@ static int update_nodemask(struct cpuset *cs, char *buf) cs->mems_generation = atomic_read(&cpuset_mems_generation); up(&callback_sem); + set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ + + fudge = 10; /* spare mmarray[] slots */ + fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ + retval = -ENOMEM; + + /* + * Allocate mmarray[] to hold mm reference for each task + * in cpuset cs. Can't kmalloc GFP_KERNEL while holding + * tasklist_lock.
We could use GFP_ATOMIC, but with a + * few more lines of code, we can retry until we get a big + * enough mmarray[] w/o using GFP_ATOMIC. + */ + while (1) { + ntasks = atomic_read(&cs->count); /* guess */ + ntasks += fudge; + mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); + if (!mmarray) + goto done; + write_lock_irq(&tasklist_lock); /* block fork */ + if (atomic_read(&cs->count) <= ntasks) + break; /* got enough */ + write_unlock_irq(&tasklist_lock); /* try again */ + kfree(mmarray); + } + + n = 0; + + /* Load up mmarray[] with mm reference for each task in cpuset. */ + do_each_thread(g, p) { + struct mm_struct *mm; + + if (n >= ntasks) { + printk(KERN_WARNING + "Cpuset mempolicy rebind incomplete.\n"); + continue; + } + if (p->cpuset != cs) + continue; + mm = get_task_mm(p); + if (!mm) + continue; + mmarray[n++] = mm; + } while_each_thread(g, p); + write_unlock_irq(&tasklist_lock); + + /* + * Now that we've dropped the tasklist spinlock, we can + * rebind the vma mempolicies of each mm in mmarray[] to their + * new cpuset, and release that mm. The mpol_rebind_mm() + * call takes mmap_sem, which we couldn't take while holding + * tasklist_lock. Forks can happen again now - the mpol_copy() + * cpuset_being_rebound check will catch such forks, and rebind + * their vma mempolicies too. Because we still hold the global + * cpuset manage_sem, we know that no other rebind effort will + * be contending for the global variable cpuset_being_rebound. + * It's ok if we rebind the same mm twice; mpol_rebind_mm() + * is idempotent. + */ + for (i = 0; i < n; i++) { + struct mm_struct *mm = mmarray[i]; + + mpol_rebind_mm(mm, &cs->mems_allowed); + mmput(mm); + } + + /* We're done rebinding vma's to this cpusets new mems_allowed. */ + kfree(mmarray); + set_cpuset_being_rebound(NULL); + retval = 0; done: return retval; } @@ -1011,6 +1093,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) struct cpuset *oldcs; cpumask_t cpus; nodemask_t from, to; + struct mm_struct *mm; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -1060,6 +1143,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) to = cs->mems_allowed; up(&callback_sem); + + mm = get_task_mm(tsk); + if (mm) { + mpol_rebind_mm(mm, &to); + mmput(mm); + } + if (is_memory_migrate(cs)) do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); put_task_struct(tsk); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c39bd86f4e..1850d0aef4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1131,6 +1131,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) } EXPORT_SYMBOL(alloc_pages_current); +/* + * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it + * rebinds the mempolicy its copying by calling mpol_rebind_policy() + * with the mems_allowed returned by cpuset_mems_allowed(). This + * keeps mempolicies cpuset relative after its cpuset moves. See + * further kernel/cpuset.c update_nodemask(). 
+ */ +void *cpuset_being_rebound; + /* Slow path of a mempolicy copy */ struct mempolicy *__mpol_copy(struct mempolicy *old) { @@ -1138,6 +1147,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + if (current_cpuset_is_being_rebound()) { + nodemask_t mems = cpuset_mems_allowed(current); + mpol_rebind_policy(old, &mems); + } *new = *old; atomic_set(&new->refcnt, 1); if (new->policy == MPOL_BIND) { @@ -1480,6 +1493,22 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) mpol_rebind_policy(tsk->mempolicy, new); } +/* + * Rebind each vma in mm to new nodemask. + * + * Call holding a reference to mm. Takes mm->mmap_sem during call. + */ + +void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) +{ + struct vm_area_struct *vma; + + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + mpol_rebind_policy(vma->vm_policy, new); + up_write(&mm->mmap_sem); +} + /* * Display pages allocated per node and memory policy via /proc. */ -- cgit v1.2.2 From 04c19fa6f16047abff2288ddbc1f0798ede5a849 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:02:00 -0800 Subject: [PATCH] cpuset: migrate all tasks in cpuset at once Given the mechanism in the previous patch to handle rebinding the per-vma mempolicies of all tasks in a cpuset that changes its memory placement, it is now easier to handle the page migration requirements of such tasks at the same time. The previous code didn't actually attempt to migrate the pages of the tasks in a cpuset whose memory placement changed until the next time each such task tried to allocate memory. This was undesirable, as users invoking memory page migration expected it to happen when the placement changed, not some unspecified time later when the task needed more memory. It is now trivial to handle the page migration at the same time as the per-vma rebinding is done. The routine cpuset.c:update_nodemask(), which handles changing a cpusets memory placement ('mems'), now checks for the special case of being asked to write a placement that is the same as before. It was harmless enough before to just recompute everything again, even though nothing had changed. But page migration is a heavyweight operation - moving pages about. So now it is worth avoiding that if asked to move a cpuset to its current location. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 19f87565be..cf8203a5fa 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -639,25 +639,14 @@ void cpuset_update_task_memory_state() task_unlock(tsk); if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { - nodemask_t oldmem = tsk->mems_allowed; - int migrate; - down(&callback_sem); task_lock(tsk); cs = tsk->cpuset; /* Maybe changed when task not locked */ - migrate = is_memory_migrate(cs); guarantee_online_mems(cs, &tsk->mems_allowed); tsk->cpuset_mems_generation = cs->mems_generation; task_unlock(tsk); up(&callback_sem); mpol_rebind_task(tsk, &tsk->mems_allowed); - if (!nodes_equal(oldmem, tsk->mems_allowed)) { - if (migrate) { - do_migrate_pages(tsk->mm, &oldmem, - &tsk->mems_allowed, - MPOL_MF_MOVE_ALL); - } - } } } @@ -815,7 +804,9 @@ static int update_cpumask(struct cpuset *cs, char *buf) * Handle user request to change the 'mems' memory placement * of a cpuset.
Needs to validate the request, update the * cpusets mems_allowed and mems_generation, and for each - * task in the cpuset, rebind any vma mempolicies. + * task in the cpuset, rebind any vma mempolicies and if + * the cpuset is marked 'memory_migrate', migrate the tasks + * pages to the new memory. * * Call with manage_sem held. May take callback_sem during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, @@ -826,9 +817,11 @@ static int update_cpumask(struct cpuset *cs, char *buf) static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; + nodemask_t oldmem; struct task_struct *g, *p; struct mm_struct **mmarray; int i, n, ntasks; + int migrate; int fudge; int retval; @@ -837,6 +830,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) if (retval < 0) goto done; nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); + oldmem = cs->mems_allowed; + if (nodes_equal(oldmem, trialcs.mems_allowed)) { + retval = 0; /* Too easy - nothing to do */ + goto done; + } if (nodes_empty(trialcs.mems_allowed)) { retval = -ENOSPC; goto done; @@ -908,12 +906,17 @@ static int update_nodemask(struct cpuset *cs, char *buf) * cpuset manage_sem, we know that no other rebind effort will * be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() - * is idempotent. + * is idempotent. Also migrate pages in each mm to new nodes. */ + migrate = is_memory_migrate(cs); for (i = 0; i < n; i++) { struct mm_struct *mm = mmarray[i]; mpol_rebind_mm(mm, &cs->mems_allowed); + if (migrate) { + do_migrate_pages(mm, &oldmem, &cs->mems_allowed, + MPOL_MF_MOVE_ALL); + } mmput(mm); } -- cgit v1.2.2 From c417f0242ebe578924a30d4e53d35b5059fed4e7 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:02:01 -0800 Subject: [PATCH] cpuset: remove test for null cpuset from alloc code path Remove a couple of more lines of code from the cpuset hooks in the page allocation code path. There was a check for a NULL cpuset pointer in the routine cpuset_update_task_memory_state() that was only needed during system boot, after the memory subsystem was initialized, before the cpuset subsystem was initialized, to catch a NULL task->cpuset pointer. Add a cpuset_init_early() routine, just before the mem_init() call in init/main.c, that sets up just enough of the init tasks cpuset structure to render cpuset_update_task_memory_state() calls harmless. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 2 ++ init/main.c | 1 + kernel/cpuset.c | 22 ++++++++++++++++------ 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 34081c168a..c472f972bd 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -16,6 +16,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? 
*/ +extern int cpuset_init_early(void); extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_fork(struct task_struct *p); @@ -49,6 +50,7 @@ extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer); #else /* !CONFIG_CPUSETS */ +static inline int cpuset_init_early(void) { return 0; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} static inline void cpuset_fork(struct task_struct *p) {} diff --git a/init/main.c b/init/main.c index 2ed3638dee..afe5eb84ad 100644 --- a/init/main.c +++ b/init/main.c @@ -512,6 +512,7 @@ asmlinkage void __init start_kernel(void) } #endif vfs_caches_init_early(); + cpuset_init_early(); mem_init(); kmem_cache_init(); setup_per_cpu_pageset(); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index cf8203a5fa..fc949e4a62 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -603,9 +603,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * Do not call this routine if in_interrupt(). * * Call without callback_sem or task_lock() held. May be called - * with or without manage_sem held. Except in early boot or - * an exiting task, when tsk->cpuset is NULL, this routine will - * acquire task_lock(). We don't need to use task_lock to guard + * with or without manage_sem held. Doesn't need task_lock to guard * against another task changing a non-NULL cpuset pointer to NULL, * as that is only done by a task on itself, and if the current task * is here, it is not simultaneously in the exit code NULL'ing its @@ -631,9 +629,6 @@ void cpuset_update_task_memory_state() struct task_struct *tsk = current; struct cpuset *cs = tsk->cpuset; - if (unlikely(!cs)) - return; - task_lock(tsk); my_cpusets_mem_gen = cs->mems_generation; task_unlock(tsk); @@ -1836,6 +1831,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) return 0; } +/* + * cpuset_init_early - just enough so that the calls to + * cpuset_update_task_memory_state() in early init code + * are harmless. + */ + +int __init cpuset_init_early(void) +{ + struct task_struct *tsk = current; + + tsk->cpuset = &top_cpuset; + tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation); + return 0; +} + /** * cpuset_init - initialize cpusets at system boot * -- cgit v1.2.2 From 6b9c2603ce07f70de9c7a8d335ecd028e8ff11f3 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:02:02 -0800 Subject: [PATCH] cpuset: use rcu directly optimization Optimize the cpuset impact on page allocation, the most performance critical cpuset hook in the kernel. On each page allocation, the cpuset hook needs to check for a possible change in the current tasks cpuset. It can now handle the common case, of no change, without taking any spinlock or semaphore, thanks to RCU. Convert a spinlock on the current task to an rcu_read_lock(), saving approximately a memory barrier and an atomic op, depending on architecture. This is done by adding rcu_assign_pointer() and synchronize_rcu() calls to the write side of the task->cpuset pointer, in cpuset.c:attach_task(), to delay freeing up a detached cpuset until after any critical sections referencing that pointer. Thanks to Andi Kleen, Nick Piggin and Eric Dumazet for ideas. 
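The read/write pairing, condensed from the hunks below: the allocation-path reader takes only an RCU read-side critical section, and the attach_task() writer publishes the new pointer and waits out readers before the old cpuset can be released:

	/* reader, hot path */
	rcu_read_lock();
	cs = rcu_dereference(tsk->cpuset);
	my_cpusets_mem_gen = cs->mems_generation;
	rcu_read_unlock();

	/* writer, attach_task() */
	rcu_assign_pointer(tsk->cpuset, cs);
	/* ... */
	synchronize_rcu();
	if (atomic_dec_and_test(&oldcs->count))
		check_for_release(oldcs, ppathbuf);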
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index fc949e4a62..6fe28d6f28 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -248,6 +249,11 @@ static struct super_block *cpuset_sb; * a tasks cpuset pointer we use task_lock(), which acts on a spinlock * (task->alloc_lock) already in the task_struct routinely used for * such matters. + * + * P.S. One more locking exception. RCU is used to guard the + * update of a tasks cpuset pointer by attach_task() and the + * access of task->cpuset->mems_generation via that pointer in + * the routine cpuset_update_task_memory_state(). */ static DECLARE_MUTEX(manage_sem); @@ -610,12 +616,24 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * cpuset pointer. This routine also might acquire callback_sem and * current->mm->mmap_sem during call. * - * The task_lock() is required to dereference current->cpuset safely. - * Without it, we could pick up the pointer value of current->cpuset - * in one instruction, and then attach_task could give us a different - * cpuset, and then the cpuset we had could be removed and freed, - * and then on our next instruction, we could dereference a no longer - * valid cpuset pointer to get its mems_generation field. + * Reading current->cpuset->mems_generation doesn't need task_lock + * to guard the current->cpuset derefence, because it is guarded + * from concurrent freeing of current->cpuset by attach_task(), + * using RCU. + * + * The rcu_dereference() is technically probably not needed, + * as I don't actually mind if I see a new cpuset pointer but + * an old value of mems_generation. However this really only + * matters on alpha systems using cpusets heavily. If I dropped + * that rcu_dereference(), it would save them a memory barrier. + * For all other arch's, rcu_dereference is a no-op anyway, and for + * alpha systems not using cpusets, another planned optimization, + * avoiding the rcu critical section for tasks in the root cpuset + * which is statically allocated, so can't vanish, will make this + * irrelevant. Better to use RCU as intended, than to engage in + * some cute trick to save a memory barrier that is impossible to + * test, for alpha systems using cpusets heavily, which might not + * even exist. 
* * This routine is needed to update the per-task mems_allowed data, * within the tasks context, when it is trying to allocate memory @@ -627,11 +645,12 @@ void cpuset_update_task_memory_state() { int my_cpusets_mem_gen; struct task_struct *tsk = current; - struct cpuset *cs = tsk->cpuset; + struct cpuset *cs; - task_lock(tsk); + rcu_read_lock(); + cs = rcu_dereference(tsk->cpuset); my_cpusets_mem_gen = cs->mems_generation; - task_unlock(tsk); + rcu_read_unlock(); if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { down(&callback_sem); @@ -1131,7 +1150,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) return -ESRCH; } atomic_inc(&cs->count); - tsk->cpuset = cs; + rcu_assign_pointer(tsk->cpuset, cs); task_unlock(tsk); guarantee_online_cpus(cs, &cpus); @@ -1151,6 +1170,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) if (is_memory_migrate(cs)) do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); put_task_struct(tsk); + synchronize_rcu(); if (atomic_dec_and_test(&oldcs->count)) check_for_release(oldcs, ppathbuf); return 0; -- cgit v1.2.2 From 7edc59628b2f5d6516b4677b3b56f5f056e45cd9 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:02:03 -0800 Subject: [PATCH] cpuset: mark number_of_cpusets read_mostly Mark cpuset global 'number_of_cpusets' as __read_mostly. This global is accessed every time a zone is considered in the zonelist loops beneath __alloc_pages, looking for a free memory page. If number_of_cpusets is just one, then we can short circuit the mems_allowed check. Since this global is read a lot on a hot path, and written rarely, it is an excellent candidate for __read_mostly. Thanks to Christoph Lameter for the suggestion. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6fe28d6f28..681a5d58d4 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -62,7 +62,7 @@ * When there is only one cpuset (the root cpuset) we can * short circuit some hooks. */ -int number_of_cpusets; +int number_of_cpusets __read_mostly; /* See "Frequency meter" comments, below. */ -- cgit v1.2.2 From 03a285f58064b8e0af08383e082e383753d9c33e Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:02:04 -0800 Subject: [PATCH] cpuset: skip rcu check if task is in root cpuset For systems that aren't using cpusets, but have CONFIG_CPUSETS enabled in their kernel (eventually this may be most distribution kernels), this patch removes even the minimal rcu_read_lock() from the memory page allocation path. Actually, it removes that rcu call for any task that is in the root cpuset (top_cpuset), which, on systems not actively using cpusets, is all tasks. We don't need the rcu check for tasks in the top_cpuset, because the top_cpuset is statically allocated, so at no risk of being freed out from underneath us.
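For reference, an abridged paraphrase (not part of this patch) of why the pointer compare is safe: top_cpuset is a file-scope static object in kernel/cpuset.c, so it can never be freed and needs no RCU protection:

	static struct cpuset top_cpuset = {
		.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
		.cpus_allowed = CPU_MASK_ALL,
		.mems_allowed = NODE_MASK_ALL,
		/* count, sibling and children initializers omitted here */
	};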
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 681a5d58d4..e04c2da9da 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -647,10 +647,15 @@ void cpuset_update_task_memory_state() struct task_struct *tsk = current; struct cpuset *cs; - rcu_read_lock(); - cs = rcu_dereference(tsk->cpuset); - my_cpusets_mem_gen = cs->mems_generation; - rcu_read_unlock(); + if (tsk->cpuset == &top_cpuset) { + /* Don't need rcu for top_cpuset. It's never freed. */ + my_cpusets_mem_gen = top_cpuset.mems_generation; + } else { + rcu_read_lock(); + cs = rcu_dereference(tsk->cpuset); + my_cpusets_mem_gen = cs->mems_generation; + rcu_read_unlock(); + } if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { down(&callback_sem); -- cgit v1.2.2 From de25968cc87cc5b76d09de8b4cbddc8f24fcf5f7 Mon Sep 17 00:00:00 2001 From: Tim Schmielau Date: Sun, 8 Jan 2006 01:02:05 -0800 Subject: [PATCH] fix more missing includes Include fixes for 2.6.14-git11. Should allow to remove sched.h from module.h on i386, x86_64, arm, ia64, ppc, ppc64, and s390. Probably more to come since I haven't yet checked the other archs. Signed-off-by: Tim Schmielau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/common/scoop.c | 1 + arch/arm/mach-realview/localtimer.c | 1 + arch/mips/sgi-ip27/ip27-berr.c | 1 + drivers/char/agp/sworks-agp.c | 1 + drivers/infiniband/hw/mthca/mthca_dev.h | 1 + drivers/infiniband/ulp/srp/ib_srp.c | 1 + drivers/macintosh/windfarm_smu_controls.c | 1 + drivers/macintosh/windfarm_smu_sensors.c | 1 + drivers/mtd/onenand/generic.c | 1 + drivers/mtd/rfd_ftl.c | 1 + drivers/pci/hotplug/pciehp.h | 1 + drivers/pci/hotplug/pciehp_hpc.c | 3 +++ drivers/rapidio/rio-scan.c | 2 ++ drivers/rapidio/rio-sysfs.c | 1 + drivers/rapidio/rio.c | 1 + drivers/usb/host/ohci-au1xxx.c | 1 + drivers/usb/host/ohci-lh7a404.c | 1 + drivers/usb/host/ohci-ppc-soc.c | 1 + include/linux/rio_drv.h | 1 + 19 files changed, 22 insertions(+) diff --git a/arch/arm/common/scoop.c b/arch/arm/common/scoop.c index b6de43e736..a2dfe0b0f1 100644 --- a/arch/arm/common/scoop.c +++ b/arch/arm/common/scoop.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/arch/arm/mach-realview/localtimer.c b/arch/arm/mach-realview/localtimer.c index c9d7c596b2..caf6b8bb6c 100644 --- a/arch/arm/mach-realview/localtimer.c +++ b/arch/arm/mach-realview/localtimer.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/arch/mips/sgi-ip27/ip27-berr.c b/arch/mips/sgi-ip27/ip27-berr.c index 07631a9767..ce907eda22 100644 --- a/arch/mips/sgi-ip27/ip27-berr.c +++ b/arch/mips/sgi-ip27/ip27-berr.c @@ -11,6 +11,7 @@ #include #include #include /* for SIGBUS */ +#include /* schow_regs(), force_sig() */ #include #include diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c index 3f8f7fa6b0..268f78d926 100644 --- a/drivers/char/agp/sworks-agp.c +++ b/drivers/char/agp/sworks-agp.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "agp.h" diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index 497ff794ef..795b379260 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include "mthca_provider.h" diff --git 
a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index ee9fe226ae..dd488d3cff 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -39,6 +39,7 @@ #include #include #include +#include #include diff --git a/drivers/macintosh/windfarm_smu_controls.c b/drivers/macintosh/windfarm_smu_controls.c index 2c3158c81f..4d811600bd 100644 --- a/drivers/macintosh/windfarm_smu_controls.c +++ b/drivers/macintosh/windfarm_smu_controls.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/macintosh/windfarm_smu_sensors.c b/drivers/macintosh/windfarm_smu_sensors.c index b558cc209d..1a00d9c75a 100644 --- a/drivers/macintosh/windfarm_smu_sensors.c +++ b/drivers/macintosh/windfarm_smu_sensors.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/mtd/onenand/generic.c b/drivers/mtd/onenand/generic.c index 45c077d0f0..af06a80f44 100644 --- a/drivers/mtd/onenand/generic.c +++ b/drivers/mtd/onenand/generic.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include diff --git a/drivers/mtd/rfd_ftl.c b/drivers/mtd/rfd_ftl.c index 20ce212638..a3e00a4635 100644 --- a/drivers/mtd/rfd_ftl.c +++ b/drivers/mtd/rfd_ftl.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 6a61b9f286..0aac6a6133 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -32,6 +32,7 @@ #include #include #include +#include /* signal_pending() */ #include #include "pci_hotplug.h" diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 0b8b26beb1..ac1e495c31 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -30,6 +30,9 @@ #include #include #include +#include +#include +#include #include #include diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index 4f7ed4bd3b..94e30fe4b8 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include "rio.h" diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c index 30a11436e2..bef9316e95 100644 --- a/drivers/rapidio/rio-sysfs.c +++ b/drivers/rapidio/rio-sysfs.c @@ -15,6 +15,7 @@ #include #include #include +#include /* for capable() */ #include "rio.h" diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 3ca1011cea..5e382470fa 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "rio.h" diff --git a/drivers/usb/host/ohci-au1xxx.c b/drivers/usb/host/ohci-au1xxx.c index d9cf3b327d..77cd6ac07e 100644 --- a/drivers/usb/host/ohci-au1xxx.c +++ b/drivers/usb/host/ohci-au1xxx.c @@ -19,6 +19,7 @@ */ #include +#include #include diff --git a/drivers/usb/host/ohci-lh7a404.c b/drivers/usb/host/ohci-lh7a404.c index 3959ccc883..0020ed7a39 100644 --- a/drivers/usb/host/ohci-lh7a404.c +++ b/drivers/usb/host/ohci-lh7a404.c @@ -17,6 +17,7 @@ */ #include +#include #include diff --git a/drivers/usb/host/ohci-ppc-soc.c b/drivers/usb/host/ohci-ppc-soc.c index 2ec6a78bd6..b2a8dfa488 100644 --- a/drivers/usb/host/ohci-ppc-soc.c +++ b/drivers/usb/host/ohci-ppc-soc.c @@ -15,6 +15,7 @@ */ #include +#include /* configure so an HC device and id are always provided */ /* always called with process context; sleeping is OK */ diff --git a/include/linux/rio_drv.h 
b/include/linux/rio_drv.h index 3bd7cce19e..157d7e3236 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -21,6 +21,7 @@ #include #include #include +#include #include extern int __rio_local_read_config_32(struct rio_mport *port, u32 offset, -- cgit v1.2.2 From 705b6c7b34f2621f95f606d0e683daa10cdb8eb9 Mon Sep 17 00:00:00 2001 From: Paul Fulghum Date: Sun, 8 Jan 2006 01:02:06 -0800 Subject: [PATCH] new driver synclink_gt New character device driver for the SyncLink GT and SyncLink AC families of synchronous and asynchronous serial adapters Signed-off-by: Paul Fulghum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/Kconfig | 8 + drivers/char/Makefile | 1 + drivers/char/synclink_gt.c | 4501 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/synclink.h | 9 +- 4 files changed, 4518 insertions(+), 1 deletion(-) create mode 100644 drivers/char/synclink_gt.c diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index b1b34edcd7..dd7e6901c5 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -220,6 +220,14 @@ config SYNCLINKMP The module will be called synclinkmp. If you want to do that, say M here. +config SYNCLINK_GT + tristate "SyncLink GT/AC support" + depends on SERIAL_NONSTANDARD + help + Support for SyncLink GT and SyncLink AC families of + synchronous and asynchronous serial adapters + manufactured by Microgate Systems, Ltd. (www.microgate.com) + config N_HDLC tristate "HDLC line discipline support" depends on SERIAL_NONSTANDARD diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 4aeae687e8..d973d14d8f 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_RISCOM8) += riscom8.o obj-$(CONFIG_ISI) += isicom.o obj-$(CONFIG_SYNCLINK) += synclink.o obj-$(CONFIG_SYNCLINKMP) += synclinkmp.o +obj-$(CONFIG_SYNCLINK_GT) += synclink_gt.o obj-$(CONFIG_N_HDLC) += n_hdlc.o obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o obj-$(CONFIG_SX) += sx.o generic_serial.o diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c new file mode 100644 index 0000000000..2b9cde94e2 --- /dev/null +++ b/drivers/char/synclink_gt.c @@ -0,0 +1,4501 @@ +/* + * $Id: synclink_gt.c,v 4.20 2005/11/08 19:51:55 paulkf Exp $ + * + * Device driver for Microgate SyncLink GT serial adapters. + * + * written by Paul Fulghum for Microgate Corporation + * paulkf@microgate.com + * + * Microgate and SyncLink are trademarks of Microgate Corporation + * + * This code is released under the GNU General Public License (GPL) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * DEBUG OUTPUT DEFINITIONS + * + * uncomment lines below to enable specific types of debug output + * + * DBGINFO information - most verbose output + * DBGERR serious errors + * DBGBH bottom half service routine debugging + * DBGISR interrupt service routine debugging + * DBGDATA output receive and transmit data + * DBGTBUF output transmit DMA buffers and registers + * DBGRBUF output receive DMA buffers and registers + */ + +#define DBGINFO(fmt) if (debug_level >= DEBUG_LEVEL_INFO) printk fmt +#define DBGERR(fmt) if (debug_level >= DEBUG_LEVEL_ERROR) printk fmt +#define DBGBH(fmt) if (debug_level >= DEBUG_LEVEL_BH) printk fmt +#define DBGISR(fmt) if (debug_level >= DEBUG_LEVEL_ISR) printk fmt +#define DBGDATA(info, buf, size, label) if (debug_level >= DEBUG_LEVEL_DATA) trace_block((info), (buf), (size), (label)) +//#define DBGTBUF(info) dump_tbufs(info) +//#define DBGRBUF(info) dump_rbufs(info) + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "linux/synclink.h" + +#ifdef CONFIG_HDLC_MODULE +#define CONFIG_HDLC 1 +#endif + +/* + * module identification + */ +static char *driver_name = "SyncLink GT"; +static char *driver_version = "$Revision: 4.20 $"; +static char *tty_driver_name = "synclink_gt"; +static char *tty_dev_prefix = "ttySLG"; +MODULE_LICENSE("GPL"); +#define MGSL_MAGIC 0x5401 +#define MAX_DEVICES 12 + +static struct pci_device_id pci_table[] = { + {PCI_VENDOR_ID_MICROGATE, SYNCLINK_GT_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,}, + {PCI_VENDOR_ID_MICROGATE, SYNCLINK_GT4_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,}, + {PCI_VENDOR_ID_MICROGATE, SYNCLINK_AC_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,}, + {0,}, /* terminate list */ +}; +MODULE_DEVICE_TABLE(pci, pci_table); + +static int init_one(struct pci_dev *dev,const struct pci_device_id *ent); +static void remove_one(struct pci_dev *dev); +static struct pci_driver pci_driver = { + .name = "synclink_gt", + .id_table = pci_table, + .probe = init_one, + .remove = __devexit_p(remove_one), +}; + +static int pci_registered; + +/* + * module configuration and status + */ +static struct slgt_info *slgt_device_list; +static int slgt_device_count; + +static int ttymajor; +static int debug_level; +static int maxframe[MAX_DEVICES]; +static int dosyncppp[MAX_DEVICES]; + +module_param(ttymajor, int, 0); +module_param(debug_level, int, 0); +module_param_array(maxframe, int, NULL, 0); +module_param_array(dosyncppp, int, NULL, 0); + +MODULE_PARM_DESC(ttymajor, "TTY major device number override: 0=auto assigned"); +MODULE_PARM_DESC(debug_level, "Debug syslog output: 0=disabled, 1 to 5=increasing detail"); +MODULE_PARM_DESC(maxframe, "Maximum frame size used by device (4096 to 65535)"); +MODULE_PARM_DESC(dosyncppp, "Enable synchronous net device, 0=disable 1=enable"); + +/* + * tty support and callbacks + */ +#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK)) + +static struct tty_driver *serial_driver; + +static int open(struct tty_struct *tty, struct file * filp); +static void close(struct tty_struct *tty, struct file * filp); +static void hangup(struct tty_struct *tty); +static void set_termios(struct tty_struct *tty, struct termios *old_termios); + +static int write(struct tty_struct *tty, const unsigned char *buf, int 
count); +static void put_char(struct tty_struct *tty, unsigned char ch); +static void send_xchar(struct tty_struct *tty, char ch); +static void wait_until_sent(struct tty_struct *tty, int timeout); +static int write_room(struct tty_struct *tty); +static void flush_chars(struct tty_struct *tty); +static void flush_buffer(struct tty_struct *tty); +static void tx_hold(struct tty_struct *tty); +static void tx_release(struct tty_struct *tty); + +static int ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); +static int read_proc(char *page, char **start, off_t off, int count,int *eof, void *data); +static int chars_in_buffer(struct tty_struct *tty); +static void throttle(struct tty_struct * tty); +static void unthrottle(struct tty_struct * tty); +static void set_break(struct tty_struct *tty, int break_state); + +/* + * generic HDLC support and callbacks + */ +#ifdef CONFIG_HDLC +#define dev_to_port(D) (dev_to_hdlc(D)->priv) +static void hdlcdev_tx_done(struct slgt_info *info); +static void hdlcdev_rx(struct slgt_info *info, char *buf, int size); +static int hdlcdev_init(struct slgt_info *info); +static void hdlcdev_exit(struct slgt_info *info); +#endif + + +/* + * device specific structures, macros and functions + */ + +#define SLGT_MAX_PORTS 4 +#define SLGT_REG_SIZE 256 + +/* + * DMA buffer descriptor and access macros + */ +struct slgt_desc +{ + unsigned short count; + unsigned short status; + unsigned int pbuf; /* physical address of data buffer */ + unsigned int next; /* physical address of next descriptor */ + + /* driver book keeping */ + char *buf; /* virtual address of data buffer */ + unsigned int pdesc; /* physical address of this descriptor */ + dma_addr_t buf_dma_addr; +}; + +#define set_desc_buffer(a,b) (a).pbuf = cpu_to_le32((unsigned int)(b)) +#define set_desc_next(a,b) (a).next = cpu_to_le32((unsigned int)(b)) +#define set_desc_count(a,b)(a).count = cpu_to_le16((unsigned short)(b)) +#define set_desc_eof(a,b) (a).status = cpu_to_le16((b) ? 
(le16_to_cpu((a).status) | BIT0) : (le16_to_cpu((a).status) & ~BIT0)) +#define desc_count(a) (le16_to_cpu((a).count)) +#define desc_status(a) (le16_to_cpu((a).status)) +#define desc_complete(a) (le16_to_cpu((a).status) & BIT15) +#define desc_eof(a) (le16_to_cpu((a).status) & BIT2) +#define desc_crc_error(a) (le16_to_cpu((a).status) & BIT1) +#define desc_abort(a) (le16_to_cpu((a).status) & BIT0) +#define desc_residue(a) ((le16_to_cpu((a).status) & 0x38) >> 3) + +struct _input_signal_events { + int ri_up; + int ri_down; + int dsr_up; + int dsr_down; + int dcd_up; + int dcd_down; + int cts_up; + int cts_down; +}; + +/* + * device instance data structure + */ +struct slgt_info { + void *if_ptr; /* General purpose pointer (used by SPPP) */ + + struct slgt_info *next_device; /* device list link */ + + int magic; + int flags; + + char device_name[25]; + struct pci_dev *pdev; + + int port_count; /* count of ports on adapter */ + int adapter_num; /* adapter instance number */ + int port_num; /* port instance number */ + + /* array of pointers to port contexts on this adapter */ + struct slgt_info *port_array[SLGT_MAX_PORTS]; + + int count; /* count of opens */ + int line; /* tty line instance number */ + unsigned short close_delay; + unsigned short closing_wait; /* time to wait before closing */ + + struct mgsl_icount icount; + + struct tty_struct *tty; + int timeout; + int x_char; /* xon/xoff character */ + int blocked_open; /* # of blocked opens */ + unsigned int read_status_mask; + unsigned int ignore_status_mask; + + wait_queue_head_t open_wait; + wait_queue_head_t close_wait; + + wait_queue_head_t status_event_wait_q; + wait_queue_head_t event_wait_q; + struct timer_list tx_timer; + struct timer_list rx_timer; + + spinlock_t lock; /* spinlock for synchronizing with ISR */ + + struct work_struct task; + u32 pending_bh; + int bh_requested; + int bh_running; + + int isr_overflow; + int irq_requested; /* nonzero if IRQ requested */ + int irq_occurred; /* for diagnostics use */ + + /* device configuration */ + + unsigned int bus_type; + unsigned int irq_level; + unsigned long irq_flags; + + unsigned char __iomem * reg_addr; /* memory mapped registers address */ + u32 phys_reg_addr; + u32 reg_offset; + int reg_addr_requested; + + MGSL_PARAMS params; /* communications parameters */ + u32 idle_mode; + u32 max_frame_size; /* as set by device config */ + + unsigned int raw_rx_size; + unsigned int if_mode; + + /* device status */ + + int rx_enabled; + int rx_restart; + + int tx_enabled; + int tx_active; + + unsigned char signals; /* serial signal states */ + unsigned int init_error; /* initialization error */ + + unsigned char *tx_buf; + int tx_count; + + char flag_buf[MAX_ASYNC_BUFFER_SIZE]; + char char_buf[MAX_ASYNC_BUFFER_SIZE]; + BOOLEAN drop_rts_on_tx_done; + struct _input_signal_events input_signal_events; + + int dcd_chkcount; /* check counts to prevent */ + int cts_chkcount; /* too many IRQs if a signal */ + int dsr_chkcount; /* is floating */ + int ri_chkcount; + + char *bufs; /* virtual address of DMA buffer lists */ + dma_addr_t bufs_dma_addr; /* physical address of buffer descriptors */ + + unsigned int rbuf_count; + struct slgt_desc *rbufs; + unsigned int rbuf_current; + unsigned int rbuf_index; + + unsigned int tbuf_count; + struct slgt_desc *tbufs; + unsigned int tbuf_current; + unsigned int tbuf_start; + + unsigned char *tmp_rbuf; + unsigned int tmp_rbuf_count; + + /* SPPP/Cisco HDLC device parts */ + + int netcount; + int dosyncppp; + spinlock_t netlock; +#ifdef CONFIG_HDLC + struct 
net_device *netdev; +#endif + +}; + +static MGSL_PARAMS default_params = { + .mode = MGSL_MODE_HDLC, + .loopback = 0, + .flags = HDLC_FLAG_UNDERRUN_ABORT15, + .encoding = HDLC_ENCODING_NRZI_SPACE, + .clock_speed = 0, + .addr_filter = 0xff, + .crc_type = HDLC_CRC_16_CCITT, + .preamble_length = HDLC_PREAMBLE_LENGTH_8BITS, + .preamble = HDLC_PREAMBLE_PATTERN_NONE, + .data_rate = 9600, + .data_bits = 8, + .stop_bits = 1, + .parity = ASYNC_PARITY_NONE +}; + + +#define BH_RECEIVE 1 +#define BH_TRANSMIT 2 +#define BH_STATUS 4 +#define IO_PIN_SHUTDOWN_LIMIT 100 + +#define DMABUFSIZE 256 +#define DESC_LIST_SIZE 4096 + +#define MASK_PARITY BIT1 +#define MASK_FRAMING BIT2 +#define MASK_BREAK BIT3 +#define MASK_OVERRUN BIT4 + +#define GSR 0x00 /* global status */ +#define TDR 0x80 /* tx data */ +#define RDR 0x80 /* rx data */ +#define TCR 0x82 /* tx control */ +#define TIR 0x84 /* tx idle */ +#define TPR 0x85 /* tx preamble */ +#define RCR 0x86 /* rx control */ +#define VCR 0x88 /* V.24 control */ +#define CCR 0x89 /* clock control */ +#define BDR 0x8a /* baud divisor */ +#define SCR 0x8c /* serial control */ +#define SSR 0x8e /* serial status */ +#define RDCSR 0x90 /* rx DMA control/status */ +#define TDCSR 0x94 /* tx DMA control/status */ +#define RDDAR 0x98 /* rx DMA descriptor address */ +#define TDDAR 0x9c /* tx DMA descriptor address */ + +#define RXIDLE BIT14 +#define RXBREAK BIT14 +#define IRQ_TXDATA BIT13 +#define IRQ_TXIDLE BIT12 +#define IRQ_TXUNDER BIT11 /* HDLC */ +#define IRQ_RXDATA BIT10 +#define IRQ_RXIDLE BIT9 /* HDLC */ +#define IRQ_RXBREAK BIT9 /* async */ +#define IRQ_RXOVER BIT8 +#define IRQ_DSR BIT7 +#define IRQ_CTS BIT6 +#define IRQ_DCD BIT5 +#define IRQ_RI BIT4 +#define IRQ_ALL 0x3ff0 +#define IRQ_MASTER BIT0 + +#define slgt_irq_on(info, mask) \ + wr_reg16((info), SCR, (unsigned short)(rd_reg16((info), SCR) | (mask))) +#define slgt_irq_off(info, mask) \ + wr_reg16((info), SCR, (unsigned short)(rd_reg16((info), SCR) & ~(mask))) + +static __u8 rd_reg8(struct slgt_info *info, unsigned int addr); +static void wr_reg8(struct slgt_info *info, unsigned int addr, __u8 value); +static __u16 rd_reg16(struct slgt_info *info, unsigned int addr); +static void wr_reg16(struct slgt_info *info, unsigned int addr, __u16 value); +static __u32 rd_reg32(struct slgt_info *info, unsigned int addr); +static void wr_reg32(struct slgt_info *info, unsigned int addr, __u32 value); + +static void msc_set_vcr(struct slgt_info *info); + +static int startup(struct slgt_info *info); +static int block_til_ready(struct tty_struct *tty, struct file * filp,struct slgt_info *info); +static void shutdown(struct slgt_info *info); +static void program_hw(struct slgt_info *info); +static void change_params(struct slgt_info *info); + +static int register_test(struct slgt_info *info); +static int irq_test(struct slgt_info *info); +static int loopback_test(struct slgt_info *info); +static int adapter_test(struct slgt_info *info); + +static void reset_adapter(struct slgt_info *info); +static void reset_port(struct slgt_info *info); +static void async_mode(struct slgt_info *info); +static void hdlc_mode(struct slgt_info *info); + +static void rx_stop(struct slgt_info *info); +static void rx_start(struct slgt_info *info); +static void reset_rbufs(struct slgt_info *info); +static void free_rbufs(struct slgt_info *info, unsigned int first, unsigned int last); +static void rdma_reset(struct slgt_info *info); +static int rx_get_frame(struct slgt_info *info); +static int rx_get_buf(struct slgt_info *info); + +static void 
tx_start(struct slgt_info *info); +static void tx_stop(struct slgt_info *info); +static void tx_set_idle(struct slgt_info *info); +static unsigned int free_tbuf_count(struct slgt_info *info); +static void reset_tbufs(struct slgt_info *info); +static void tdma_reset(struct slgt_info *info); +static void tx_load(struct slgt_info *info, const char *buf, unsigned int count); + +static void get_signals(struct slgt_info *info); +static void set_signals(struct slgt_info *info); +static void enable_loopback(struct slgt_info *info); +static void set_rate(struct slgt_info *info, u32 data_rate); + +static int bh_action(struct slgt_info *info); +static void bh_handler(void* context); +static void bh_transmit(struct slgt_info *info); +static void isr_serial(struct slgt_info *info); +static void isr_rdma(struct slgt_info *info); +static void isr_txeom(struct slgt_info *info, unsigned short status); +static void isr_tdma(struct slgt_info *info); +static irqreturn_t slgt_interrupt(int irq, void *dev_id, struct pt_regs * regs); + +static int alloc_dma_bufs(struct slgt_info *info); +static void free_dma_bufs(struct slgt_info *info); +static int alloc_desc(struct slgt_info *info); +static void free_desc(struct slgt_info *info); +static int alloc_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count); +static void free_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count); + +static int alloc_tmp_rbuf(struct slgt_info *info); +static void free_tmp_rbuf(struct slgt_info *info); + +static void tx_timeout(unsigned long context); +static void rx_timeout(unsigned long context); + +/* + * ioctl handlers + */ +static int get_stats(struct slgt_info *info, struct mgsl_icount __user *user_icount); +static int get_params(struct slgt_info *info, MGSL_PARAMS __user *params); +static int set_params(struct slgt_info *info, MGSL_PARAMS __user *params); +static int get_txidle(struct slgt_info *info, int __user *idle_mode); +static int set_txidle(struct slgt_info *info, int idle_mode); +static int tx_enable(struct slgt_info *info, int enable); +static int tx_abort(struct slgt_info *info); +static int rx_enable(struct slgt_info *info, int enable); +static int modem_input_wait(struct slgt_info *info,int arg); +static int wait_mgsl_event(struct slgt_info *info, int __user *mask_ptr); +static int tiocmget(struct tty_struct *tty, struct file *file); +static int tiocmset(struct tty_struct *tty, struct file *file, + unsigned int set, unsigned int clear); +static void set_break(struct tty_struct *tty, int break_state); +static int get_interface(struct slgt_info *info, int __user *if_mode); +static int set_interface(struct slgt_info *info, int if_mode); + +/* + * driver functions + */ +static void add_device(struct slgt_info *info); +static void device_init(int adapter_num, struct pci_dev *pdev); +static int claim_resources(struct slgt_info *info); +static void release_resources(struct slgt_info *info); + +/* + * DEBUG OUTPUT CODE + */ +#ifndef DBGINFO +#define DBGINFO(fmt) +#endif +#ifndef DBGERR +#define DBGERR(fmt) +#endif +#ifndef DBGBH +#define DBGBH(fmt) +#endif +#ifndef DBGISR +#define DBGISR(fmt) +#endif + +#ifdef DBGDATA +static void trace_block(struct slgt_info *info, const char *data, int count, const char *label) +{ + int i; + int linecount; + printk("%s %s data:\n",info->device_name, label); + while(count) { + linecount = (count > 16) ? 
16 : count; + for(i=0; i < linecount; i++) + printk("%02X ",(unsigned char)data[i]); + for(;i<17;i++) + printk(" "); + for(i=0;i<linecount;i++) { + if (data[i]>=040 && data[i]<=0176) + printk("%c",data[i]); + else + printk("."); + } + printk("\n"); + data += linecount; + count -= linecount; + } +} +#else +#define DBGDATA(info, buf, size, label) +#endif + +#ifdef DBGTBUF +static void dump_tbufs(struct slgt_info *info) +{ + int i; + printk("tbuf_current=%d\n", info->tbuf_current); + for (i=0 ; i < info->tbuf_count ; i++) { + printk("%d: count=%04X status=%04X\n", + i, le16_to_cpu(info->tbufs[i].count), le16_to_cpu(info->tbufs[i].status)); + } +} +#else +#define DBGTBUF(info) +#endif + +#ifdef DBGRBUF +static void dump_rbufs(struct slgt_info *info) +{ + int i; + printk("rbuf_current=%d\n", info->rbuf_current); + for (i=0 ; i < info->rbuf_count ; i++) { + printk("%d: count=%04X status=%04X\n", + i, le16_to_cpu(info->rbufs[i].count), le16_to_cpu(info->rbufs[i].status)); + } +} +#else +#define DBGRBUF(info) +#endif + +static inline int sanity_check(struct slgt_info *info, char *devname, const char *name) +{ +#ifdef SANITY_CHECK + if (!info) { + printk("null struct slgt_info for (%s) in %s\n", devname, name); + return 1; + } + if (info->magic != MGSL_MAGIC) { + printk("bad magic number struct slgt_info (%s) in %s\n", devname, name); + return 1; + } +#else + if (!info) + return 1; +#endif + return 0; +} + +/** + * line discipline callback wrappers + * + * The wrappers maintain line discipline references + * while calling into the line discipline. + * + * ldisc_receive_buf - pass receive data to line discipline + */ +static void ldisc_receive_buf(struct tty_struct *tty, + const __u8 *data, char *flags, int count) +{ + struct tty_ldisc *ld; + if (!tty) + return; + ld = tty_ldisc_ref(tty); + if (ld) { + if (ld->receive_buf) + ld->receive_buf(tty, data, flags, count); + tty_ldisc_deref(ld); + } +} + +/* tty callbacks */ + +static int open(struct tty_struct *tty, struct file *filp) +{ + struct slgt_info *info; + int retval, line; + unsigned long flags; + + line = tty->index; + if ((line < 0) || (line >= slgt_device_count)) { + DBGERR(("%s: open with invalid line #%d.\n", driver_name, line)); + return -ENODEV; + } + + info = slgt_device_list; + while(info && info->line != line) + info = info->next_device; + if (sanity_check(info, tty->name, "open")) + return -ENODEV; + if (info->init_error) { + DBGERR(("%s init error=%d\n", info->device_name, info->init_error)); + return -ENODEV; + } + + tty->driver_data = info; + info->tty = tty; + + DBGINFO(("%s open, old ref count = %d\n", info->device_name, info->count)); + + /* If port is closing, signal caller to try again */ + if (tty_hung_up_p(filp) || info->flags & ASYNC_CLOSING){ + if (info->flags & ASYNC_CLOSING) + interruptible_sleep_on(&info->close_wait); + retval = ((info->flags & ASYNC_HUP_NOTIFY) ? + -EAGAIN : -ERESTARTSYS); + goto cleanup; + } + + info->tty->low_latency = (info->flags & ASYNC_LOW_LATENCY) ?
1 : 0; + + spin_lock_irqsave(&info->netlock, flags); + if (info->netcount) { + retval = -EBUSY; + spin_unlock_irqrestore(&info->netlock, flags); + goto cleanup; + } + info->count++; + spin_unlock_irqrestore(&info->netlock, flags); + + if (info->count == 1) { + /* 1st open on this device, init hardware */ + retval = startup(info); + if (retval < 0) + goto cleanup; + } + + retval = block_til_ready(tty, filp, info); + if (retval) { + DBGINFO(("%s block_til_ready rc=%d\n", info->device_name, retval)); + goto cleanup; + } + + retval = 0; + +cleanup: + if (retval) { + if (tty->count == 1) + info->tty = NULL; /* tty layer will release tty struct */ + if(info->count) + info->count--; + } + + DBGINFO(("%s open rc=%d\n", info->device_name, retval)); + return retval; +} + +static void close(struct tty_struct *tty, struct file *filp) +{ + struct slgt_info *info = tty->driver_data; + + if (sanity_check(info, tty->name, "close")) + return; + DBGINFO(("%s close entry, count=%d\n", info->device_name, info->count)); + + if (!info->count) + return; + + if (tty_hung_up_p(filp)) + goto cleanup; + + if ((tty->count == 1) && (info->count != 1)) { + /* + * tty->count is 1 and the tty structure will be freed. + * info->count should be one in this case. + * if it's not, correct it so that the port is shutdown. + */ + DBGERR(("%s close: bad refcount; tty->count=1, " + "info->count=%d\n", info->device_name, info->count)); + info->count = 1; + } + + info->count--; + + /* if at least one open remaining, leave hardware active */ + if (info->count) + goto cleanup; + + info->flags |= ASYNC_CLOSING; + + /* set tty->closing to notify line discipline to + * only process XON/XOFF characters. Only the N_TTY + * discipline appears to use this (ppp does not). + */ + tty->closing = 1; + + /* wait for transmit data to clear all layers */ + + if (info->closing_wait != ASYNC_CLOSING_WAIT_NONE) { + DBGINFO(("%s call tty_wait_until_sent\n", info->device_name)); + tty_wait_until_sent(tty, info->closing_wait); + } + + if (info->flags & ASYNC_INITIALIZED) + wait_until_sent(tty, info->timeout); + if (tty->driver->flush_buffer) + tty->driver->flush_buffer(tty); + tty_ldisc_flush(tty); + + shutdown(info); + + tty->closing = 0; + info->tty = NULL; + + if (info->blocked_open) { + if (info->close_delay) { + msleep_interruptible(jiffies_to_msecs(info->close_delay)); + } + wake_up_interruptible(&info->open_wait); + } + + info->flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING); + + wake_up_interruptible(&info->close_wait); + +cleanup: + DBGINFO(("%s close exit, count=%d\n", tty->driver->name, info->count)); +} + +static void hangup(struct tty_struct *tty) +{ + struct slgt_info *info = tty->driver_data; + + if (sanity_check(info, tty->name, "hangup")) + return; + DBGINFO(("%s hangup\n", info->device_name)); + + flush_buffer(tty); + shutdown(info); + + info->count = 0; + info->flags &= ~ASYNC_NORMAL_ACTIVE; + info->tty = NULL; + + wake_up_interruptible(&info->open_wait); +} + +static void set_termios(struct tty_struct *tty, struct termios *old_termios) +{ + struct slgt_info *info = tty->driver_data; + unsigned long flags; + + DBGINFO(("%s set_termios\n", tty->driver->name)); + + /* just return if nothing has changed */ + if ((tty->termios->c_cflag == old_termios->c_cflag) + && (RELEVANT_IFLAG(tty->termios->c_iflag) + == RELEVANT_IFLAG(old_termios->c_iflag))) + return; + + change_params(info); + + /* Handle transition to B0 status */ + if (old_termios->c_cflag & CBAUD && + !(tty->termios->c_cflag & CBAUD)) { + info->signals &= ~(SerialSignal_RTS + 
SerialSignal_DTR); + spin_lock_irqsave(&info->lock,flags); + set_signals(info); + spin_unlock_irqrestore(&info->lock,flags); + } + + /* Handle transition away from B0 status */ + if (!(old_termios->c_cflag & CBAUD) && + tty->termios->c_cflag & CBAUD) { + info->signals |= SerialSignal_DTR; + if (!(tty->termios->c_cflag & CRTSCTS) || + !test_bit(TTY_THROTTLED, &tty->flags)) { + info->signals |= SerialSignal_RTS; + } + spin_lock_irqsave(&info->lock,flags); + set_signals(info); + spin_unlock_irqrestore(&info->lock,flags); + } + + /* Handle turning off CRTSCTS */ + if (old_termios->c_cflag & CRTSCTS && + !(tty->termios->c_cflag & CRTSCTS)) { + tty->hw_stopped = 0; + tx_release(tty); + } +} + +static int write(struct tty_struct *tty, + const unsigned char *buf, int count) +{ + int ret = 0; + struct slgt_info *info = tty->driver_data; + unsigned long flags; + + if (sanity_check(info, tty->name, "write")) + goto cleanup; + DBGINFO(("%s write count=%d\n", info->device_name, count)); + + if (!tty || !info->tx_buf) + goto cleanup; + + if (count > info->max_frame_size) { + ret = -EIO; + goto cleanup; + } + + if (!count) + goto cleanup; + + if (info->params.mode == MGSL_MODE_RAW) { + unsigned int bufs_needed = (count/DMABUFSIZE); + unsigned int bufs_free = free_tbuf_count(info); + if (count % DMABUFSIZE) + ++bufs_needed; + if (bufs_needed > bufs_free) + goto cleanup; + } else { + if (info->tx_active) + goto cleanup; + if (info->tx_count) { + /* send accumulated data from send_char() calls */ + /* as frame and wait before accepting more data. */ + tx_load(info, info->tx_buf, info->tx_count); + goto start; + } + } + + ret = info->tx_count = count; + tx_load(info, buf, count); + goto start; + +start: + if (info->tx_count && !tty->stopped && !tty->hw_stopped) { + spin_lock_irqsave(&info->lock,flags); + if (!info->tx_active) + tx_start(info); + spin_unlock_irqrestore(&info->lock,flags); + } + +cleanup: + DBGINFO(("%s write rc=%d\n", info->device_name, ret)); + return ret; +} + +static void put_char(struct tty_struct *tty, unsigned char ch) +{ + struct slgt_info *info = tty->driver_data; + unsigned long flags; + + if (sanity_check(info, tty->name, "put_char")) + return; + DBGINFO(("%s put_char(%d)\n", info->device_name, ch)); + if (!tty || !info->tx_buf) + return; + spin_lock_irqsave(&info->lock,flags); + if (!info->tx_active && (info->tx_count < info->max_frame_size)) + info->tx_buf[info->tx_count++] = ch; + spin_unlock_irqrestore(&info->lock,flags); +} + +static void send_xchar(struct tty_struct *tty, char ch) +{ + struct slgt_info *info = tty->driver_data; + unsigned long flags; + + if (sanity_check(info, tty->name, "send_xchar")) + return; + DBGINFO(("%s send_xchar(%d)\n", info->device_name, ch)); + info->x_char = ch; + if (ch) { + spin_lock_irqsave(&info->lock,flags); + if (!info->tx_enabled) + tx_start(info); + spin_unlock_irqrestore(&info->lock,flags); + } +} + +static void wait_until_sent(struct tty_struct *tty, int timeout) +{ + struct slgt_info *info = tty->driver_data; + unsigned long orig_jiffies, char_time; + + if (!info ) + return; + if (sanity_check(info, tty->name, "wait_until_sent")) + return; + DBGINFO(("%s wait_until_sent entry\n", info->device_name)); + if (!(info->flags & ASYNC_INITIALIZED)) + goto exit; + + orig_jiffies = jiffies; + + /* Set check interval to 1/5 of estimated time to + * send a character, and make it at least 1. The check + * interval should also be less than the timeout. + * Note: use tight timings here to satisfy the NIST-PCTS. 
+ */ + + if (info->params.data_rate) { + char_time = info->timeout/(32 * 5); + if (!char_time) + char_time++; + } else + char_time = 1; + + if (timeout) + char_time = min_t(unsigned long, char_time, timeout); + + while (info->tx_active) { + msleep_interruptible(jiffies_to_msecs(char_time)); + if (signal_pending(current)) + break; + if (timeout && time_after(jiffies, orig_jiffies + timeout)) + break; + } + +exit: + DBGINFO(("%s wait_until_sent exit\n", info->device_name)); +} + +static int write_room(struct tty_struct *tty) +{ + struct slgt_info *info = tty->driver_data; + int ret; + + if (sanity_check(info, tty->name, "write_room")) + return 0; + ret = (info->tx_active) ? 0 : HDLC_MAX_FRAME_SIZE; + DBGINFO(("%s write_room=%d\n", info->device_name, ret)); + return ret; +} + +static void flush_chars(struct tty_struct *tty) +{ + struct slgt_info *info = tty->driver_data; + unsigned long flags; + + if (sanity_check(info, tty->name, "flush_chars")) + return; + DBGINFO(("%s flush_chars entry tx_count=%d\n", info->device_name, info->tx_count)); + + if (info->tx_count <= 0 || tty->stopped || + tty->hw_stopped || !info->tx_buf) + return; + + DBGINFO(("%s flush_chars start transmit\n", info->device_name)); + + spin_lock_irqsave(&info->lock,flags); + if (!info->tx_active && info->tx_count) { + tx_load(info, info->tx_buf,info->tx_count); + tx_start(info); + } + spin_unlock_irqrestore(&info->lock,flags); +} + +static void flush_buffer(struct tty_struct *tty) +{ + struct slgt_info *info = tty->driver_data; + unsigned long flags; + + if (sanity_check(info, tty->name, "flush_buffer")) + return; + DBGINFO(("%s flush_buffer\n", info->device_name)); + + spin_lock_irqsave(&info->lock,flags); + if (!info->tx_active) + info->tx_count = 0; + spin_unlock_irqrestore(&info->lock,flags); + + wake_up_interruptible(&tty->write_wait); + tty_wakeup(tty); +} + +/* + * throttle (stop) transmitter + */ +static void tx_hold(struct tty_struct *tty) +{ + struct slgt_info *info = tty->driver_data; + unsigned long flags; + + if (sanity_check(info, tty->name, "tx_hold")) + return; + DBGINFO(("%s tx_hold\n", info->device_name)); + spin_lock_irqsave(&info->lock,flags); + if (info->tx_enabled && info->params.mode == MGSL_MODE_ASYNC) + tx_stop(info); + spin_unlock_irqrestore(&info->lock,flags); +} + +/* + * release (start) transmitter + */ +static void tx_release(struct tty_struct *tty) +{ + struct slgt_info *info = tty->driver_data; + unsigned long flags; + + if (sanity_check(info, tty->name, "tx_release")) + return; + DBGINFO(("%s tx_release\n", info->device_name)); + spin_lock_irqsave(&info->lock,flags); + if (!info->tx_active && info->tx_count) { + tx_load(info, info->tx_buf, info->tx_count); + tx_start(info); + } + spin_unlock_irqrestore(&info->lock,flags); +} + +/* + * Service an IOCTL request + * + * Arguments + * + * tty pointer to tty instance data + * file pointer to associated file object for device + * cmd IOCTL command code + * arg command argument/context + * + * Return 0 if success, otherwise error code + */ +static int ioctl(struct tty_struct *tty, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct slgt_info *info = tty->driver_data; + struct mgsl_icount cnow; /* kernel counter temps */ + struct serial_icounter_struct __user *p_cuser; /* user space */ + unsigned long flags; + void __user *argp = (void __user *)arg; + + if (sanity_check(info, tty->name, "ioctl")) + return -ENODEV; + DBGINFO(("%s ioctl() cmd=%08X\n", info->device_name, cmd)); + + if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) && + 
(cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) { + if (tty->flags & (1 << TTY_IO_ERROR)) + return -EIO; + } + + switch (cmd) { + case MGSL_IOCGPARAMS: + return get_params(info, argp); + case MGSL_IOCSPARAMS: + return set_params(info, argp); + case MGSL_IOCGTXIDLE: + return get_txidle(info, argp); + case MGSL_IOCSTXIDLE: + return set_txidle(info, (int)arg); + case MGSL_IOCTXENABLE: + return tx_enable(info, (int)arg); + case MGSL_IOCRXENABLE: + return rx_enable(info, (int)arg); + case MGSL_IOCTXABORT: + return tx_abort(info); + case MGSL_IOCGSTATS: + return get_stats(info, argp); + case MGSL_IOCWAITEVENT: + return wait_mgsl_event(info, argp); + case TIOCMIWAIT: + return modem_input_wait(info,(int)arg); + case MGSL_IOCGIF: + return get_interface(info, argp); + case MGSL_IOCSIF: + return set_interface(info,(int)arg); + case TIOCGICOUNT: + spin_lock_irqsave(&info->lock,flags); + cnow = info->icount; + spin_unlock_irqrestore(&info->lock,flags); + p_cuser = argp; + if (put_user(cnow.cts, &p_cuser->cts) || + put_user(cnow.dsr, &p_cuser->dsr) || + put_user(cnow.rng, &p_cuser->rng) || + put_user(cnow.dcd, &p_cuser->dcd) || + put_user(cnow.rx, &p_cuser->rx) || + put_user(cnow.tx, &p_cuser->tx) || + put_user(cnow.frame, &p_cuser->frame) || + put_user(cnow.overrun, &p_cuser->overrun) || + put_user(cnow.parity, &p_cuser->parity) || + put_user(cnow.brk, &p_cuser->brk) || + put_user(cnow.buf_overrun, &p_cuser->buf_overrun)) + return -EFAULT; + return 0; + default: + return -ENOIOCTLCMD; + } + return 0; +} + +/* + * proc fs support + */ +static inline int line_info(char *buf, struct slgt_info *info) +{ + char stat_buf[30]; + int ret; + unsigned long flags; + + ret = sprintf(buf, "%s: IO=%08X IRQ=%d MaxFrameSize=%u\n", + info->device_name, info->phys_reg_addr, + info->irq_level, info->max_frame_size); + + /* output current serial signal states */ + spin_lock_irqsave(&info->lock,flags); + get_signals(info); + spin_unlock_irqrestore(&info->lock,flags); + + stat_buf[0] = 0; + stat_buf[1] = 0; + if (info->signals & SerialSignal_RTS) + strcat(stat_buf, "|RTS"); + if (info->signals & SerialSignal_CTS) + strcat(stat_buf, "|CTS"); + if (info->signals & SerialSignal_DTR) + strcat(stat_buf, "|DTR"); + if (info->signals & SerialSignal_DSR) + strcat(stat_buf, "|DSR"); + if (info->signals & SerialSignal_DCD) + strcat(stat_buf, "|CD"); + if (info->signals & SerialSignal_RI) + strcat(stat_buf, "|RI"); + + if (info->params.mode != MGSL_MODE_ASYNC) { + ret += sprintf(buf+ret, "\tHDLC txok:%d rxok:%d", + info->icount.txok, info->icount.rxok); + if (info->icount.txunder) + ret += sprintf(buf+ret, " txunder:%d", info->icount.txunder); + if (info->icount.txabort) + ret += sprintf(buf+ret, " txabort:%d", info->icount.txabort); + if (info->icount.rxshort) + ret += sprintf(buf+ret, " rxshort:%d", info->icount.rxshort); + if (info->icount.rxlong) + ret += sprintf(buf+ret, " rxlong:%d", info->icount.rxlong); + if (info->icount.rxover) + ret += sprintf(buf+ret, " rxover:%d", info->icount.rxover); + if (info->icount.rxcrc) + ret += sprintf(buf+ret, " rxcrc:%d", info->icount.rxcrc); + } else { + ret += sprintf(buf+ret, "\tASYNC tx:%d rx:%d", + info->icount.tx, info->icount.rx); + if (info->icount.frame) + ret += sprintf(buf+ret, " fe:%d", info->icount.frame); + if (info->icount.parity) + ret += sprintf(buf+ret, " pe:%d", info->icount.parity); + if (info->icount.brk) + ret += sprintf(buf+ret, " brk:%d", info->icount.brk); + if (info->icount.overrun) + ret += sprintf(buf+ret, " oe:%d", info->icount.overrun); + } + + /* Append serial signal 
status to end */ + ret += sprintf(buf+ret, " %s\n", stat_buf+1); + + ret += sprintf(buf+ret, "\ttxactive=%d bh_req=%d bh_run=%d pending_bh=%x\n", + info->tx_active,info->bh_requested,info->bh_running, + info->pending_bh); + + return ret; +} + +/* Called to print information about devices + */ +static int read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int len = 0, l; + off_t begin = 0; + struct slgt_info *info; + + len += sprintf(page, "synclink_gt driver:%s\n", driver_version); + + info = slgt_device_list; + while( info ) { + l = line_info(page + len, info); + len += l; + if (len+begin > off+count) + goto done; + if (len+begin < off) { + begin += len; + len = 0; + } + info = info->next_device; + } + + *eof = 1; +done: + if (off >= len+begin) + return 0; + *start = page + (off-begin); + return ((count < begin+len-off) ? count : begin+len-off); +} + +/* + * return count of bytes in transmit buffer + */ +static int chars_in_buffer(struct tty_struct *tty) +{ + struct slgt_info *info = tty->driver_data; + if (sanity_check(info, tty->name, "chars_in_buffer")) + return 0; + DBGINFO(("%s chars_in_buffer()=%d\n", info->device_name, info->tx_count)); + return info->tx_count; +} + +/* + * signal remote device to throttle send data (our receive data) + */ +static void throttle(struct tty_struct * tty) +{ + struct slgt_info *info = tty->driver_data; + unsigned long flags; + + if (sanity_check(info, tty->name, "throttle")) + return; + DBGINFO(("%s throttle\n", info->device_name)); + if (I_IXOFF(tty)) + send_xchar(tty, STOP_CHAR(tty)); + if (tty->termios->c_cflag & CRTSCTS) { + spin_lock_irqsave(&info->lock,flags); + info->signals &= ~SerialSignal_RTS; + set_signals(info); + spin_unlock_irqrestore(&info->lock,flags); + } +} + +/* + * signal remote device to stop throttling send data (our receive data) + */ +static void unthrottle(struct tty_struct * tty) +{ + struct slgt_info *info = tty->driver_data; + unsigned long flags; + + if (sanity_check(info, tty->name, "unthrottle")) + return; + DBGINFO(("%s unthrottle\n", info->device_name)); + if (I_IXOFF(tty)) { + if (info->x_char) + info->x_char = 0; + else + send_xchar(tty, START_CHAR(tty)); + } + if (tty->termios->c_cflag & CRTSCTS) { + spin_lock_irqsave(&info->lock,flags); + info->signals |= SerialSignal_RTS; + set_signals(info); + spin_unlock_irqrestore(&info->lock,flags); + } +} + +/* + * set or clear transmit break condition + * break_state -1=set break condition, 0=clear + */ +static void set_break(struct tty_struct *tty, int break_state) +{ + struct slgt_info *info = tty->driver_data; + unsigned short value; + unsigned long flags; + + if (sanity_check(info, tty->name, "set_break")) + return; + DBGINFO(("%s set_break(%d)\n", info->device_name, break_state)); + + spin_lock_irqsave(&info->lock,flags); + value = rd_reg16(info, TCR); + if (break_state == -1) + value |= BIT6; + else + value &= ~BIT6; + wr_reg16(info, TCR, value); + spin_unlock_irqrestore(&info->lock,flags); +} + +#ifdef CONFIG_HDLC + +/** + * called by generic HDLC layer when protocol selected (PPP, frame relay, etc.) 
+ * set encoding and frame check sequence (FCS) options + * + * dev pointer to network device structure + * encoding serial encoding setting + * parity FCS setting + * + * returns 0 if success, otherwise error code + */ +static int hdlcdev_attach(struct net_device *dev, unsigned short encoding, + unsigned short parity) +{ + struct slgt_info *info = dev_to_port(dev); + unsigned char new_encoding; + unsigned short new_crctype; + + /* return error if TTY interface open */ + if (info->count) + return -EBUSY; + + DBGINFO(("%s hdlcdev_attach\n", info->device_name)); + + switch (encoding) + { + case ENCODING_NRZ: new_encoding = HDLC_ENCODING_NRZ; break; + case ENCODING_NRZI: new_encoding = HDLC_ENCODING_NRZI_SPACE; break; + case ENCODING_FM_MARK: new_encoding = HDLC_ENCODING_BIPHASE_MARK; break; + case ENCODING_FM_SPACE: new_encoding = HDLC_ENCODING_BIPHASE_SPACE; break; + case ENCODING_MANCHESTER: new_encoding = HDLC_ENCODING_BIPHASE_LEVEL; break; + default: return -EINVAL; + } + + switch (parity) + { + case PARITY_NONE: new_crctype = HDLC_CRC_NONE; break; + case PARITY_CRC16_PR1_CCITT: new_crctype = HDLC_CRC_16_CCITT; break; + case PARITY_CRC32_PR1_CCITT: new_crctype = HDLC_CRC_32_CCITT; break; + default: return -EINVAL; + } + + info->params.encoding = new_encoding; + info->params.crc_type = new_crctype;; + + /* if network interface up, reprogram hardware */ + if (info->netcount) + program_hw(info); + + return 0; +} + +/** + * called by generic HDLC layer to send frame + * + * skb socket buffer containing HDLC frame + * dev pointer to network device structure + * + * returns 0 if success, otherwise error code + */ +static int hdlcdev_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct slgt_info *info = dev_to_port(dev); + struct net_device_stats *stats = hdlc_stats(dev); + unsigned long flags; + + DBGINFO(("%s hdlc_xmit\n", dev->name)); + + /* stop sending until this frame completes */ + netif_stop_queue(dev); + + /* copy data to device buffers */ + info->tx_count = skb->len; + tx_load(info, skb->data, skb->len); + + /* update network statistics */ + stats->tx_packets++; + stats->tx_bytes += skb->len; + + /* done with socket buffer, so free it */ + dev_kfree_skb(skb); + + /* save start time for transmit timeout detection */ + dev->trans_start = jiffies; + + /* start hardware transmitter if necessary */ + spin_lock_irqsave(&info->lock,flags); + if (!info->tx_active) + tx_start(info); + spin_unlock_irqrestore(&info->lock,flags); + + return 0; +} + +/** + * called by network layer when interface enabled + * claim resources and initialize hardware + * + * dev pointer to network device structure + * + * returns 0 if success, otherwise error code + */ +static int hdlcdev_open(struct net_device *dev) +{ + struct slgt_info *info = dev_to_port(dev); + int rc; + unsigned long flags; + + DBGINFO(("%s hdlcdev_open\n", dev->name)); + + /* generic HDLC layer open processing */ + if ((rc = hdlc_open(dev))) + return rc; + + /* arbitrate between network and tty opens */ + spin_lock_irqsave(&info->netlock, flags); + if (info->count != 0 || info->netcount != 0) { + DBGINFO(("%s hdlc_open busy\n", dev->name)); + spin_unlock_irqrestore(&info->netlock, flags); + return -EBUSY; + } + info->netcount=1; + spin_unlock_irqrestore(&info->netlock, flags); + + /* claim resources and init adapter */ + if ((rc = startup(info)) != 0) { + spin_lock_irqsave(&info->netlock, flags); + info->netcount=0; + spin_unlock_irqrestore(&info->netlock, flags); + return rc; + } + + /* assert DTR and RTS, apply hardware settings */ 
+ info->signals |= SerialSignal_RTS + SerialSignal_DTR; + program_hw(info); + + /* enable network layer transmit */ + dev->trans_start = jiffies; + netif_start_queue(dev); + + /* inform generic HDLC layer of current DCD status */ + spin_lock_irqsave(&info->lock, flags); + get_signals(info); + spin_unlock_irqrestore(&info->lock, flags); + hdlc_set_carrier(info->signals & SerialSignal_DCD, dev); + + return 0; +} + +/** + * called by network layer when interface is disabled + * shutdown hardware and release resources + * + * dev pointer to network device structure + * + * returns 0 if success, otherwise error code + */ +static int hdlcdev_close(struct net_device *dev) +{ + struct slgt_info *info = dev_to_port(dev); + unsigned long flags; + + DBGINFO(("%s hdlcdev_close\n", dev->name)); + + netif_stop_queue(dev); + + /* shutdown adapter and release resources */ + shutdown(info); + + hdlc_close(dev); + + spin_lock_irqsave(&info->netlock, flags); + info->netcount=0; + spin_unlock_irqrestore(&info->netlock, flags); + + return 0; +} + +/** + * called by network layer to process IOCTL call to network device + * + * dev pointer to network device structure + * ifr pointer to network interface request structure + * cmd IOCTL command code + * + * returns 0 if success, otherwise error code + */ +static int hdlcdev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + const size_t size = sizeof(sync_serial_settings); + sync_serial_settings new_line; + sync_serial_settings __user *line = ifr->ifr_settings.ifs_ifsu.sync; + struct slgt_info *info = dev_to_port(dev); + unsigned int flags; + + DBGINFO(("%s hdlcdev_ioctl\n", dev->name)); + + /* return error if TTY interface open */ + if (info->count) + return -EBUSY; + + if (cmd != SIOCWANDEV) + return hdlc_ioctl(dev, ifr, cmd); + + switch(ifr->ifr_settings.type) { + case IF_GET_IFACE: /* return current sync_serial_settings */ + + ifr->ifr_settings.type = IF_IFACE_SYNC_SERIAL; + if (ifr->ifr_settings.size < size) { + ifr->ifr_settings.size = size; /* data size wanted */ + return -ENOBUFS; + } + + flags = info->params.flags & (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_RXC_DPLL | + HDLC_FLAG_RXC_BRG | HDLC_FLAG_RXC_TXCPIN | + HDLC_FLAG_TXC_TXCPIN | HDLC_FLAG_TXC_DPLL | + HDLC_FLAG_TXC_BRG | HDLC_FLAG_TXC_RXCPIN); + + switch (flags){ + case (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_TXCPIN): new_line.clock_type = CLOCK_EXT; break; + case (HDLC_FLAG_RXC_BRG | HDLC_FLAG_TXC_BRG): new_line.clock_type = CLOCK_INT; break; + case (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_BRG): new_line.clock_type = CLOCK_TXINT; break; + case (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_RXCPIN): new_line.clock_type = CLOCK_TXFROMRX; break; + default: new_line.clock_type = CLOCK_DEFAULT; + } + + new_line.clock_rate = info->params.clock_speed; + new_line.loopback = info->params.loopback ? 
1:0; + + if (copy_to_user(line, &new_line, size)) + return -EFAULT; + return 0; + + case IF_IFACE_SYNC_SERIAL: /* set sync_serial_settings */ + + if(!capable(CAP_NET_ADMIN)) + return -EPERM; + if (copy_from_user(&new_line, line, size)) + return -EFAULT; + + switch (new_line.clock_type) + { + case CLOCK_EXT: flags = HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_TXCPIN; break; + case CLOCK_TXFROMRX: flags = HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_RXCPIN; break; + case CLOCK_INT: flags = HDLC_FLAG_RXC_BRG | HDLC_FLAG_TXC_BRG; break; + case CLOCK_TXINT: flags = HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_BRG; break; + case CLOCK_DEFAULT: flags = info->params.flags & + (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_RXC_DPLL | + HDLC_FLAG_RXC_BRG | HDLC_FLAG_RXC_TXCPIN | + HDLC_FLAG_TXC_TXCPIN | HDLC_FLAG_TXC_DPLL | + HDLC_FLAG_TXC_BRG | HDLC_FLAG_TXC_RXCPIN); break; + default: return -EINVAL; + } + + if (new_line.loopback != 0 && new_line.loopback != 1) + return -EINVAL; + + info->params.flags &= ~(HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_RXC_DPLL | + HDLC_FLAG_RXC_BRG | HDLC_FLAG_RXC_TXCPIN | + HDLC_FLAG_TXC_TXCPIN | HDLC_FLAG_TXC_DPLL | + HDLC_FLAG_TXC_BRG | HDLC_FLAG_TXC_RXCPIN); + info->params.flags |= flags; + + info->params.loopback = new_line.loopback; + + if (flags & (HDLC_FLAG_RXC_BRG | HDLC_FLAG_TXC_BRG)) + info->params.clock_speed = new_line.clock_rate; + else + info->params.clock_speed = 0; + + /* if network interface up, reprogram hardware */ + if (info->netcount) + program_hw(info); + return 0; + + default: + return hdlc_ioctl(dev, ifr, cmd); + } +} + +/** + * called by network layer when transmit timeout is detected + * + * dev pointer to network device structure + */ +static void hdlcdev_tx_timeout(struct net_device *dev) +{ + struct slgt_info *info = dev_to_port(dev); + struct net_device_stats *stats = hdlc_stats(dev); + unsigned long flags; + + DBGINFO(("%s hdlcdev_tx_timeout\n", dev->name)); + + stats->tx_errors++; + stats->tx_aborted_errors++; + + spin_lock_irqsave(&info->lock,flags); + tx_stop(info); + spin_unlock_irqrestore(&info->lock,flags); + + netif_wake_queue(dev); +} + +/** + * called by device driver when transmit completes + * reenable network layer transmit if stopped + * + * info pointer to device instance information + */ +static void hdlcdev_tx_done(struct slgt_info *info) +{ + if (netif_queue_stopped(info->netdev)) + netif_wake_queue(info->netdev); +} + +/** + * called by device driver when frame received + * pass frame to network layer + * + * info pointer to device instance information + * buf pointer to buffer contianing frame data + * size count of data bytes in buf + */ +static void hdlcdev_rx(struct slgt_info *info, char *buf, int size) +{ + struct sk_buff *skb = dev_alloc_skb(size); + struct net_device *dev = info->netdev; + struct net_device_stats *stats = hdlc_stats(dev); + + DBGINFO(("%s hdlcdev_rx\n", dev->name)); + + if (skb == NULL) { + DBGERR(("%s: can't alloc skb, drop packet\n", dev->name)); + stats->rx_dropped++; + return; + } + + memcpy(skb_put(skb, size),buf,size); + + skb->protocol = hdlc_type_trans(skb, info->netdev); + + stats->rx_packets++; + stats->rx_bytes += size; + + netif_rx(skb); + + info->netdev->last_rx = jiffies; +} + +/** + * called by device driver when adding device instance + * do generic HDLC initialization + * + * info pointer to device instance information + * + * returns 0 if success, otherwise error code + */ +static int hdlcdev_init(struct slgt_info *info) +{ + int rc; + struct net_device *dev; + hdlc_device *hdlc; + + /* allocate and initialize network and 
HDLC layer objects */ + + if (!(dev = alloc_hdlcdev(info))) { + printk(KERN_ERR "%s hdlc device alloc failure\n", info->device_name); + return -ENOMEM; + } + + /* for network layer reporting purposes only */ + dev->mem_start = info->phys_reg_addr; + dev->mem_end = info->phys_reg_addr + SLGT_REG_SIZE - 1; + dev->irq = info->irq_level; + + /* network layer callbacks and settings */ + dev->do_ioctl = hdlcdev_ioctl; + dev->open = hdlcdev_open; + dev->stop = hdlcdev_close; + dev->tx_timeout = hdlcdev_tx_timeout; + dev->watchdog_timeo = 10*HZ; + dev->tx_queue_len = 50; + + /* generic HDLC layer callbacks and settings */ + hdlc = dev_to_hdlc(dev); + hdlc->attach = hdlcdev_attach; + hdlc->xmit = hdlcdev_xmit; + + /* register objects with HDLC layer */ + if ((rc = register_hdlc_device(dev))) { + printk(KERN_WARNING "%s:unable to register hdlc device\n",__FILE__); + free_netdev(dev); + return rc; + } + + info->netdev = dev; + return 0; +} + +/** + * called by device driver when removing device instance + * do generic HDLC cleanup + * + * info pointer to device instance information + */ +static void hdlcdev_exit(struct slgt_info *info) +{ + unregister_hdlc_device(info->netdev); + free_netdev(info->netdev); + info->netdev = NULL; +} + +#endif /* ifdef CONFIG_HDLC */ + +/* + * get async data from rx DMA buffers + */ +static void rx_async(struct slgt_info *info) +{ + struct tty_struct *tty = info->tty; + struct mgsl_icount *icount = &info->icount; + unsigned int start, end; + unsigned char *p; + unsigned char status; + struct slgt_desc *bufs = info->rbufs; + int i, count; + + start = end = info->rbuf_current; + + while(desc_complete(bufs[end])) { + count = desc_count(bufs[end]) - info->rbuf_index; + p = bufs[end].buf + info->rbuf_index; + + DBGISR(("%s rx_async count=%d\n", info->device_name, count)); + DBGDATA(info, p, count, "rx"); + + for(i=0 ; i < count; i+=2, p+=2) { + if (tty) { + if (tty->flip.count >= TTY_FLIPBUF_SIZE) + tty_flip_buffer_push(tty); + if (tty->flip.count >= TTY_FLIPBUF_SIZE) + break; + *tty->flip.char_buf_ptr = *p; + *tty->flip.flag_buf_ptr = 0; + } + icount->rx++; + + if ((status = *(p+1) & (BIT9 + BIT8))) { + if (status & BIT9) + icount->parity++; + else if (status & BIT8) + icount->frame++; + /* discard char if tty control flags say so */ + if (status & info->ignore_status_mask) + continue; + if (tty) { + if (status & BIT9) + *tty->flip.flag_buf_ptr = TTY_PARITY; + else if (status & BIT8) + *tty->flip.flag_buf_ptr = TTY_FRAME; + } + } + if (tty) { + tty->flip.flag_buf_ptr++; + tty->flip.char_buf_ptr++; + tty->flip.count++; + } + } + + if (i < count) { + /* receive buffer not completed */ + info->rbuf_index += i; + info->rx_timer.expires = jiffies + 1; + add_timer(&info->rx_timer); + break; + } + + info->rbuf_index = 0; + free_rbufs(info, end, end); + + if (++end == info->rbuf_count) + end = 0; + + /* if entire list searched then no frame available */ + if (end == start) + break; + } + + if (tty && tty->flip.count) + tty_flip_buffer_push(tty); +} + +/* + * return next bottom half action to perform + */ +static int bh_action(struct slgt_info *info) +{ + unsigned long flags; + int rc; + + spin_lock_irqsave(&info->lock,flags); + + if (info->pending_bh & BH_RECEIVE) { + info->pending_bh &= ~BH_RECEIVE; + rc = BH_RECEIVE; + } else if (info->pending_bh & BH_TRANSMIT) { + info->pending_bh &= ~BH_TRANSMIT; + rc = BH_TRANSMIT; + } else if (info->pending_bh & BH_STATUS) { + info->pending_bh &= ~BH_STATUS; + rc = BH_STATUS; + } else { + /* Mark BH routine as complete */ + info->bh_running 
= 0; + info->bh_requested = 0; + rc = 0; + } + + spin_unlock_irqrestore(&info->lock,flags); + + return rc; +} + +/* + * perform bottom half processing + */ +static void bh_handler(void* context) +{ + struct slgt_info *info = context; + int action; + + if (!info) + return; + info->bh_running = 1; + + while((action = bh_action(info))) { + switch (action) { + case BH_RECEIVE: + DBGBH(("%s bh receive\n", info->device_name)); + switch(info->params.mode) { + case MGSL_MODE_ASYNC: + rx_async(info); + break; + case MGSL_MODE_HDLC: + while(rx_get_frame(info)); + break; + case MGSL_MODE_RAW: + while(rx_get_buf(info)); + break; + } + /* restart receiver if rx DMA buffers exhausted */ + if (info->rx_restart) + rx_start(info); + break; + case BH_TRANSMIT: + bh_transmit(info); + break; + case BH_STATUS: + DBGBH(("%s bh status\n", info->device_name)); + info->ri_chkcount = 0; + info->dsr_chkcount = 0; + info->dcd_chkcount = 0; + info->cts_chkcount = 0; + break; + default: + DBGBH(("%s unknown action\n", info->device_name)); + break; + } + } + DBGBH(("%s bh_handler exit\n", info->device_name)); +} + +static void bh_transmit(struct slgt_info *info) +{ + struct tty_struct *tty = info->tty; + + DBGBH(("%s bh_transmit\n", info->device_name)); + if (tty) { + tty_wakeup(tty); + wake_up_interruptible(&tty->write_wait); + } +} + +static void dsr_change(struct slgt_info *info) +{ + get_signals(info); + DBGISR(("dsr_change %s signals=%04X\n", info->device_name, info->signals)); + if ((info->dsr_chkcount)++ == IO_PIN_SHUTDOWN_LIMIT) { + slgt_irq_off(info, IRQ_DSR); + return; + } + info->icount.dsr++; + if (info->signals & SerialSignal_DSR) + info->input_signal_events.dsr_up++; + else + info->input_signal_events.dsr_down++; + wake_up_interruptible(&info->status_event_wait_q); + wake_up_interruptible(&info->event_wait_q); + info->pending_bh |= BH_STATUS; +} + +static void cts_change(struct slgt_info *info) +{ + get_signals(info); + DBGISR(("cts_change %s signals=%04X\n", info->device_name, info->signals)); + if ((info->cts_chkcount)++ == IO_PIN_SHUTDOWN_LIMIT) { + slgt_irq_off(info, IRQ_CTS); + return; + } + info->icount.cts++; + if (info->signals & SerialSignal_CTS) + info->input_signal_events.cts_up++; + else + info->input_signal_events.cts_down++; + wake_up_interruptible(&info->status_event_wait_q); + wake_up_interruptible(&info->event_wait_q); + info->pending_bh |= BH_STATUS; + + if (info->flags & ASYNC_CTS_FLOW) { + if (info->tty) { + if (info->tty->hw_stopped) { + if (info->signals & SerialSignal_CTS) { + info->tty->hw_stopped = 0; + info->pending_bh |= BH_TRANSMIT; + return; + } + } else { + if (!(info->signals & SerialSignal_CTS)) + info->tty->hw_stopped = 1; + } + } + } +} + +static void dcd_change(struct slgt_info *info) +{ + get_signals(info); + DBGISR(("dcd_change %s signals=%04X\n", info->device_name, info->signals)); + if ((info->dcd_chkcount)++ == IO_PIN_SHUTDOWN_LIMIT) { + slgt_irq_off(info, IRQ_DCD); + return; + } + info->icount.dcd++; + if (info->signals & SerialSignal_DCD) { + info->input_signal_events.dcd_up++; + } else { + info->input_signal_events.dcd_down++; + } +#ifdef CONFIG_HDLC + if (info->netcount) + hdlc_set_carrier(info->signals & SerialSignal_DCD, info->netdev); +#endif + wake_up_interruptible(&info->status_event_wait_q); + wake_up_interruptible(&info->event_wait_q); + info->pending_bh |= BH_STATUS; + + if (info->flags & ASYNC_CHECK_CD) { + if (info->signals & SerialSignal_DCD) + wake_up_interruptible(&info->open_wait); + else { + if (info->tty) + tty_hangup(info->tty); + } + } +} + 
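The interrupt-side helpers above (isr_serial() and the dsr/cts/dcd change handlers) only mark work by setting bits in info->pending_bh; bh_action() then claims and clears one pending bit at a time under the port spinlock, and bh_handler() loops until bh_action() returns 0, so work posted while an action is being serviced is picked up on the next pass. The following is a minimal userspace sketch of that dispatch pattern, not part of the patch: a pthread mutex stands in for the driver's spinlock, and the names post_work() and next_action() are illustrative only.

/* Sketch of the pending_bh dispatch pattern: the "interrupt" side posts
 * bit flags, the "bottom half" drains them one action at a time under a
 * lock until none remain. Illustrative only; not driver code. */
#include <pthread.h>
#include <stdio.h>

#define BH_RECEIVE  1
#define BH_TRANSMIT 2
#define BH_STATUS   4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int pending;              /* stands in for info->pending_bh */

static void post_work(unsigned int bits)  /* "interrupt" side: mark work */
{
	pthread_mutex_lock(&lock);
	pending |= bits;
	pthread_mutex_unlock(&lock);
}

static int next_action(void)              /* "bottom half": claim one action */
{
	int rc = 0;

	pthread_mutex_lock(&lock);
	if (pending & BH_RECEIVE) {
		pending &= ~BH_RECEIVE;
		rc = BH_RECEIVE;
	} else if (pending & BH_TRANSMIT) {
		pending &= ~BH_TRANSMIT;
		rc = BH_TRANSMIT;
	} else if (pending & BH_STATUS) {
		pending &= ~BH_STATUS;
		rc = BH_STATUS;
	}
	pthread_mutex_unlock(&lock);
	return rc;
}

int main(void)
{
	int action;

	post_work(BH_RECEIVE | BH_STATUS);
	while ((action = next_action()) != 0)
		printf("handling action %d\n", action);
	return 0;
}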
+static void ri_change(struct slgt_info *info) +{ + get_signals(info); + DBGISR(("ri_change %s signals=%04X\n", info->device_name, info->signals)); + if ((info->ri_chkcount)++ == IO_PIN_SHUTDOWN_LIMIT) { + slgt_irq_off(info, IRQ_RI); + return; + } + info->icount.dcd++; + if (info->signals & SerialSignal_RI) { + info->input_signal_events.ri_up++; + } else { + info->input_signal_events.ri_down++; + } + wake_up_interruptible(&info->status_event_wait_q); + wake_up_interruptible(&info->event_wait_q); + info->pending_bh |= BH_STATUS; +} + +static void isr_serial(struct slgt_info *info) +{ + unsigned short status = rd_reg16(info, SSR); + + DBGISR(("%s isr_serial status=%04X\n", info->device_name, status)); + + wr_reg16(info, SSR, status); /* clear pending */ + + info->irq_occurred = 1; + + if (info->params.mode == MGSL_MODE_ASYNC) { + if (status & IRQ_TXIDLE) { + if (info->tx_count) + isr_txeom(info, status); + } + if ((status & IRQ_RXBREAK) && (status & RXBREAK)) { + info->icount.brk++; + /* process break detection if tty control allows */ + if (info->tty) { + if (!(status & info->ignore_status_mask)) { + if (info->read_status_mask & MASK_BREAK) { + *info->tty->flip.flag_buf_ptr = TTY_BREAK; + if (info->flags & ASYNC_SAK) + do_SAK(info->tty); + } + } + } + } + } else { + if (status & (IRQ_TXIDLE + IRQ_TXUNDER)) + isr_txeom(info, status); + + if (status & IRQ_RXIDLE) { + if (status & RXIDLE) + info->icount.rxidle++; + else + info->icount.exithunt++; + wake_up_interruptible(&info->event_wait_q); + } + + if (status & IRQ_RXOVER) + rx_start(info); + } + + if (status & IRQ_DSR) + dsr_change(info); + if (status & IRQ_CTS) + cts_change(info); + if (status & IRQ_DCD) + dcd_change(info); + if (status & IRQ_RI) + ri_change(info); +} + +static void isr_rdma(struct slgt_info *info) +{ + unsigned int status = rd_reg32(info, RDCSR); + + DBGISR(("%s isr_rdma status=%08x\n", info->device_name, status)); + + /* RDCSR (rx DMA control/status) + * + * 31..07 reserved + * 06 save status byte to DMA buffer + * 05 error + * 04 eol (end of list) + * 03 eob (end of buffer) + * 02 IRQ enable + * 01 reset + * 00 enable + */ + wr_reg32(info, RDCSR, status); /* clear pending */ + + if (status & (BIT5 + BIT4)) { + DBGISR(("%s isr_rdma rx_restart=1\n", info->device_name)); + info->rx_restart = 1; + } + info->pending_bh |= BH_RECEIVE; +} + +static void isr_tdma(struct slgt_info *info) +{ + unsigned int status = rd_reg32(info, TDCSR); + + DBGISR(("%s isr_tdma status=%08x\n", info->device_name, status)); + + /* TDCSR (tx DMA control/status) + * + * 31..06 reserved + * 05 error + * 04 eol (end of list) + * 03 eob (end of buffer) + * 02 IRQ enable + * 01 reset + * 00 enable + */ + wr_reg32(info, TDCSR, status); /* clear pending */ + + if (status & (BIT5 + BIT4 + BIT3)) { + // another transmit buffer has completed + // run bottom half to get more send data from user + info->pending_bh |= BH_TRANSMIT; + } +} + +static void isr_txeom(struct slgt_info *info, unsigned short status) +{ + DBGISR(("%s txeom status=%04x\n", info->device_name, status)); + + slgt_irq_off(info, IRQ_TXDATA + IRQ_TXIDLE + IRQ_TXUNDER); + tdma_reset(info); + reset_tbufs(info); + if (status & IRQ_TXUNDER) { + unsigned short val = rd_reg16(info, TCR); + wr_reg16(info, TCR, (unsigned short)(val | BIT2)); /* set reset bit */ + wr_reg16(info, TCR, val); /* clear reset bit */ + } + + if (info->tx_active) { + if (info->params.mode != MGSL_MODE_ASYNC) { + if (status & IRQ_TXUNDER) + info->icount.txunder++; + else if (status & IRQ_TXIDLE) + info->icount.txok++; + } + + 
info->tx_active = 0; + info->tx_count = 0; + + del_timer(&info->tx_timer); + + if (info->params.mode != MGSL_MODE_ASYNC && info->drop_rts_on_tx_done) { + info->signals &= ~SerialSignal_RTS; + info->drop_rts_on_tx_done = 0; + set_signals(info); + } + +#ifdef CONFIG_HDLC + if (info->netcount) + hdlcdev_tx_done(info); + else +#endif + { + if (info->tty && (info->tty->stopped || info->tty->hw_stopped)) { + tx_stop(info); + return; + } + info->pending_bh |= BH_TRANSMIT; + } + } +} + +/* interrupt service routine + * + * irq interrupt number + * dev_id device ID supplied during interrupt registration + * regs interrupted processor context + */ +static irqreturn_t slgt_interrupt(int irq, void *dev_id, struct pt_regs * regs) +{ + struct slgt_info *info; + unsigned int gsr; + unsigned int i; + + DBGISR(("slgt_interrupt irq=%d entry\n", irq)); + + info = dev_id; + if (!info) + return IRQ_NONE; + + spin_lock(&info->lock); + + while((gsr = rd_reg32(info, GSR) & 0xffffff00)) { + DBGISR(("%s gsr=%08x\n", info->device_name, gsr)); + info->irq_occurred = 1; + for(i=0; i < info->port_count ; i++) { + if (info->port_array[i] == NULL) + continue; + if (gsr & (BIT8 << i)) + isr_serial(info->port_array[i]); + if (gsr & (BIT16 << (i*2))) + isr_rdma(info->port_array[i]); + if (gsr & (BIT17 << (i*2))) + isr_tdma(info->port_array[i]); + } + } + + for(i=0; i < info->port_count ; i++) { + struct slgt_info *port = info->port_array[i]; + + if (port && (port->count || port->netcount) && + port->pending_bh && !port->bh_running && + !port->bh_requested) { + DBGISR(("%s bh queued\n", port->device_name)); + schedule_work(&port->task); + port->bh_requested = 1; + } + } + + spin_unlock(&info->lock); + + DBGISR(("slgt_interrupt irq=%d exit\n", irq)); + return IRQ_HANDLED; +} + +static int startup(struct slgt_info *info) +{ + DBGINFO(("%s startup\n", info->device_name)); + + if (info->flags & ASYNC_INITIALIZED) + return 0; + + if (!info->tx_buf) { + info->tx_buf = kmalloc(info->max_frame_size, GFP_KERNEL); + if (!info->tx_buf) { + DBGERR(("%s can't allocate tx buffer\n", info->device_name)); + return -ENOMEM; + } + } + + info->pending_bh = 0; + + memset(&info->icount, 0, sizeof(info->icount)); + + /* program hardware for current parameters */ + change_params(info); + + if (info->tty) + clear_bit(TTY_IO_ERROR, &info->tty->flags); + + info->flags |= ASYNC_INITIALIZED; + + return 0; +} + +/* + * called by close() and hangup() to shutdown hardware + */ +static void shutdown(struct slgt_info *info) +{ + unsigned long flags; + + if (!(info->flags & ASYNC_INITIALIZED)) + return; + + DBGINFO(("%s shutdown\n", info->device_name)); + + /* clear status wait queue because status changes */ + /* can't happen after shutting down the hardware */ + wake_up_interruptible(&info->status_event_wait_q); + wake_up_interruptible(&info->event_wait_q); + + del_timer_sync(&info->tx_timer); + del_timer_sync(&info->rx_timer); + + kfree(info->tx_buf); + info->tx_buf = NULL; + + spin_lock_irqsave(&info->lock,flags); + + tx_stop(info); + rx_stop(info); + + slgt_irq_off(info, IRQ_ALL | IRQ_MASTER); + + if (!info->tty || info->tty->termios->c_cflag & HUPCL) { + info->signals &= ~(SerialSignal_DTR + SerialSignal_RTS); + set_signals(info); + } + + spin_unlock_irqrestore(&info->lock,flags); + + if (info->tty) + set_bit(TTY_IO_ERROR, &info->tty->flags); + + info->flags &= ~ASYNC_INITIALIZED; +} + +static void program_hw(struct slgt_info *info) +{ + unsigned long flags; + + spin_lock_irqsave(&info->lock,flags); + + rx_stop(info); + tx_stop(info); + + if 
(info->params.mode == MGSL_MODE_HDLC || + info->params.mode == MGSL_MODE_RAW || + info->netcount) + hdlc_mode(info); + else + async_mode(info); + + set_signals(info); + + info->dcd_chkcount = 0; + info->cts_chkcount = 0; + info->ri_chkcount = 0; + info->dsr_chkcount = 0; + + slgt_irq_on(info, IRQ_DCD | IRQ_CTS | IRQ_DSR); + get_signals(info); + + if (info->netcount || + (info->tty && info->tty->termios->c_cflag & CREAD)) + rx_start(info); + + spin_unlock_irqrestore(&info->lock,flags); +} + +/* + * reconfigure adapter based on new parameters + */ +static void change_params(struct slgt_info *info) +{ + unsigned cflag; + int bits_per_char; + + if (!info->tty || !info->tty->termios) + return; + DBGINFO(("%s change_params\n", info->device_name)); + + cflag = info->tty->termios->c_cflag; + + /* if B0 rate (hangup) specified then negate DTR and RTS */ + /* otherwise assert DTR and RTS */ + if (cflag & CBAUD) + info->signals |= SerialSignal_RTS + SerialSignal_DTR; + else + info->signals &= ~(SerialSignal_RTS + SerialSignal_DTR); + + /* byte size and parity */ + + switch (cflag & CSIZE) { + case CS5: info->params.data_bits = 5; break; + case CS6: info->params.data_bits = 6; break; + case CS7: info->params.data_bits = 7; break; + case CS8: info->params.data_bits = 8; break; + default: info->params.data_bits = 7; break; + } + + info->params.stop_bits = (cflag & CSTOPB) ? 2 : 1; + + if (cflag & PARENB) + info->params.parity = (cflag & PARODD) ? ASYNC_PARITY_ODD : ASYNC_PARITY_EVEN; + else + info->params.parity = ASYNC_PARITY_NONE; + + /* calculate number of jiffies to transmit a full + * FIFO (32 bytes) at specified data rate + */ + bits_per_char = info->params.data_bits + + info->params.stop_bits + 1; + + info->params.data_rate = tty_get_baud_rate(info->tty); + + if (info->params.data_rate) { + info->timeout = (32*HZ*bits_per_char) / + info->params.data_rate; + } + info->timeout += HZ/50; /* Add .02 seconds of slop */ + + if (cflag & CRTSCTS) + info->flags |= ASYNC_CTS_FLOW; + else + info->flags &= ~ASYNC_CTS_FLOW; + + if (cflag & CLOCAL) + info->flags &= ~ASYNC_CHECK_CD; + else + info->flags |= ASYNC_CHECK_CD; + + /* process tty input control flags */ + + info->read_status_mask = IRQ_RXOVER; + if (I_INPCK(info->tty)) + info->read_status_mask |= MASK_PARITY | MASK_FRAMING; + if (I_BRKINT(info->tty) || I_PARMRK(info->tty)) + info->read_status_mask |= MASK_BREAK; + if (I_IGNPAR(info->tty)) + info->ignore_status_mask |= MASK_PARITY | MASK_FRAMING; + if (I_IGNBRK(info->tty)) { + info->ignore_status_mask |= MASK_BREAK; + /* If ignoring parity and break indicators, ignore + * overruns too. (For real raw support). 
+ */ + if (I_IGNPAR(info->tty)) + info->ignore_status_mask |= MASK_OVERRUN; + } + + program_hw(info); +} + +static int get_stats(struct slgt_info *info, struct mgsl_icount __user *user_icount) +{ + DBGINFO(("%s get_stats\n", info->device_name)); + if (!user_icount) { + memset(&info->icount, 0, sizeof(info->icount)); + } else { + if (copy_to_user(user_icount, &info->icount, sizeof(struct mgsl_icount))) + return -EFAULT; + } + return 0; +} + +static int get_params(struct slgt_info *info, MGSL_PARAMS __user *user_params) +{ + DBGINFO(("%s get_params\n", info->device_name)); + if (copy_to_user(user_params, &info->params, sizeof(MGSL_PARAMS))) + return -EFAULT; + return 0; +} + +static int set_params(struct slgt_info *info, MGSL_PARAMS __user *new_params) +{ + unsigned long flags; + MGSL_PARAMS tmp_params; + + DBGINFO(("%s set_params\n", info->device_name)); + if (copy_from_user(&tmp_params, new_params, sizeof(MGSL_PARAMS))) + return -EFAULT; + + spin_lock_irqsave(&info->lock, flags); + memcpy(&info->params, &tmp_params, sizeof(MGSL_PARAMS)); + spin_unlock_irqrestore(&info->lock, flags); + + change_params(info); + + return 0; +} + +static int get_txidle(struct slgt_info *info, int __user *idle_mode) +{ + DBGINFO(("%s get_txidle=%d\n", info->device_name, info->idle_mode)); + if (put_user(info->idle_mode, idle_mode)) + return -EFAULT; + return 0; +} + +static int set_txidle(struct slgt_info *info, int idle_mode) +{ + unsigned long flags; + DBGINFO(("%s set_txidle(%d)\n", info->device_name, idle_mode)); + spin_lock_irqsave(&info->lock,flags); + info->idle_mode = idle_mode; + tx_set_idle(info); + spin_unlock_irqrestore(&info->lock,flags); + return 0; +} + +static int tx_enable(struct slgt_info *info, int enable) +{ + unsigned long flags; + DBGINFO(("%s tx_enable(%d)\n", info->device_name, enable)); + spin_lock_irqsave(&info->lock,flags); + if (enable) { + if (!info->tx_enabled) + tx_start(info); + } else { + if (info->tx_enabled) + tx_stop(info); + } + spin_unlock_irqrestore(&info->lock,flags); + return 0; +} + +/* + * abort transmit HDLC frame + */ +static int tx_abort(struct slgt_info *info) +{ + unsigned long flags; + DBGINFO(("%s tx_abort\n", info->device_name)); + spin_lock_irqsave(&info->lock,flags); + tdma_reset(info); + spin_unlock_irqrestore(&info->lock,flags); + return 0; +} + +static int rx_enable(struct slgt_info *info, int enable) +{ + unsigned long flags; + DBGINFO(("%s rx_enable(%d)\n", info->device_name, enable)); + spin_lock_irqsave(&info->lock,flags); + if (enable) { + if (!info->rx_enabled) + rx_start(info); + } else { + if (info->rx_enabled) + rx_stop(info); + } + spin_unlock_irqrestore(&info->lock,flags); + return 0; +} + +/* + * wait for specified event to occur + */ +static int wait_mgsl_event(struct slgt_info *info, int __user *mask_ptr) +{ + unsigned long flags; + int s; + int rc=0; + struct mgsl_icount cprev, cnow; + int events; + int mask; + struct _input_signal_events oldsigs, newsigs; + DECLARE_WAITQUEUE(wait, current); + + if (get_user(mask, mask_ptr)) + return -EFAULT; + + DBGINFO(("%s wait_mgsl_event(%d)\n", info->device_name, mask)); + + spin_lock_irqsave(&info->lock,flags); + + /* return immediately if state matches requested events */ + get_signals(info); + s = info->signals; + + events = mask & + ( ((s & SerialSignal_DSR) ? MgslEvent_DsrActive:MgslEvent_DsrInactive) + + ((s & SerialSignal_DCD) ? MgslEvent_DcdActive:MgslEvent_DcdInactive) + + ((s & SerialSignal_CTS) ? MgslEvent_CtsActive:MgslEvent_CtsInactive) + + ((s & SerialSignal_RI) ? 
MgslEvent_RiActive :MgslEvent_RiInactive) ); + if (events) { + spin_unlock_irqrestore(&info->lock,flags); + goto exit; + } + + /* save current irq counts */ + cprev = info->icount; + oldsigs = info->input_signal_events; + + /* enable hunt and idle irqs if needed */ + if (mask & (MgslEvent_ExitHuntMode+MgslEvent_IdleReceived)) { + unsigned short val = rd_reg16(info, SCR); + if (!(val & IRQ_RXIDLE)) + wr_reg16(info, SCR, (unsigned short)(val | IRQ_RXIDLE)); + } + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&info->event_wait_q, &wait); + + spin_unlock_irqrestore(&info->lock,flags); + + for(;;) { + schedule(); + if (signal_pending(current)) { + rc = -ERESTARTSYS; + break; + } + + /* get current irq counts */ + spin_lock_irqsave(&info->lock,flags); + cnow = info->icount; + newsigs = info->input_signal_events; + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&info->lock,flags); + + /* if no change, wait aborted for some reason */ + if (newsigs.dsr_up == oldsigs.dsr_up && + newsigs.dsr_down == oldsigs.dsr_down && + newsigs.dcd_up == oldsigs.dcd_up && + newsigs.dcd_down == oldsigs.dcd_down && + newsigs.cts_up == oldsigs.cts_up && + newsigs.cts_down == oldsigs.cts_down && + newsigs.ri_up == oldsigs.ri_up && + newsigs.ri_down == oldsigs.ri_down && + cnow.exithunt == cprev.exithunt && + cnow.rxidle == cprev.rxidle) { + rc = -EIO; + break; + } + + events = mask & + ( (newsigs.dsr_up != oldsigs.dsr_up ? MgslEvent_DsrActive:0) + + (newsigs.dsr_down != oldsigs.dsr_down ? MgslEvent_DsrInactive:0) + + (newsigs.dcd_up != oldsigs.dcd_up ? MgslEvent_DcdActive:0) + + (newsigs.dcd_down != oldsigs.dcd_down ? MgslEvent_DcdInactive:0) + + (newsigs.cts_up != oldsigs.cts_up ? MgslEvent_CtsActive:0) + + (newsigs.cts_down != oldsigs.cts_down ? MgslEvent_CtsInactive:0) + + (newsigs.ri_up != oldsigs.ri_up ? MgslEvent_RiActive:0) + + (newsigs.ri_down != oldsigs.ri_down ? MgslEvent_RiInactive:0) + + (cnow.exithunt != cprev.exithunt ? MgslEvent_ExitHuntMode:0) + + (cnow.rxidle != cprev.rxidle ? 
MgslEvent_IdleReceived:0) ); + if (events) + break; + + cprev = cnow; + oldsigs = newsigs; + } + + remove_wait_queue(&info->event_wait_q, &wait); + set_current_state(TASK_RUNNING); + + + if (mask & (MgslEvent_ExitHuntMode + MgslEvent_IdleReceived)) { + spin_lock_irqsave(&info->lock,flags); + if (!waitqueue_active(&info->event_wait_q)) { + /* disable enable exit hunt mode/idle rcvd IRQs */ + wr_reg16(info, SCR, + (unsigned short)(rd_reg16(info, SCR) & ~IRQ_RXIDLE)); + } + spin_unlock_irqrestore(&info->lock,flags); + } +exit: + if (rc == 0) + rc = put_user(events, mask_ptr); + return rc; +} + +static int get_interface(struct slgt_info *info, int __user *if_mode) +{ + DBGINFO(("%s get_interface=%x\n", info->device_name, info->if_mode)); + if (put_user(info->if_mode, if_mode)) + return -EFAULT; + return 0; +} + +static int set_interface(struct slgt_info *info, int if_mode) +{ + unsigned long flags; + unsigned char val; + + DBGINFO(("%s set_interface=%x)\n", info->device_name, if_mode)); + spin_lock_irqsave(&info->lock,flags); + info->if_mode = if_mode; + + msc_set_vcr(info); + + /* TCR (tx control) 07 1=RTS driver control */ + val = rd_reg16(info, TCR); + if (info->if_mode & MGSL_INTERFACE_RTS_EN) + val |= BIT7; + else + val &= ~BIT7; + wr_reg16(info, TCR, val); + + spin_unlock_irqrestore(&info->lock,flags); + return 0; +} + +static int modem_input_wait(struct slgt_info *info,int arg) +{ + unsigned long flags; + int rc; + struct mgsl_icount cprev, cnow; + DECLARE_WAITQUEUE(wait, current); + + /* save current irq counts */ + spin_lock_irqsave(&info->lock,flags); + cprev = info->icount; + add_wait_queue(&info->status_event_wait_q, &wait); + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&info->lock,flags); + + for(;;) { + schedule(); + if (signal_pending(current)) { + rc = -ERESTARTSYS; + break; + } + + /* get new irq counts */ + spin_lock_irqsave(&info->lock,flags); + cnow = info->icount; + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&info->lock,flags); + + /* if no change, wait aborted for some reason */ + if (cnow.rng == cprev.rng && cnow.dsr == cprev.dsr && + cnow.dcd == cprev.dcd && cnow.cts == cprev.cts) { + rc = -EIO; + break; + } + + /* check for change in caller specified modem input */ + if ((arg & TIOCM_RNG && cnow.rng != cprev.rng) || + (arg & TIOCM_DSR && cnow.dsr != cprev.dsr) || + (arg & TIOCM_CD && cnow.dcd != cprev.dcd) || + (arg & TIOCM_CTS && cnow.cts != cprev.cts)) { + rc = 0; + break; + } + + cprev = cnow; + } + remove_wait_queue(&info->status_event_wait_q, &wait); + set_current_state(TASK_RUNNING); + return rc; +} + +/* + * return state of serial control and status signals + */ +static int tiocmget(struct tty_struct *tty, struct file *file) +{ + struct slgt_info *info = tty->driver_data; + unsigned int result; + unsigned long flags; + + spin_lock_irqsave(&info->lock,flags); + get_signals(info); + spin_unlock_irqrestore(&info->lock,flags); + + result = ((info->signals & SerialSignal_RTS) ? TIOCM_RTS:0) + + ((info->signals & SerialSignal_DTR) ? TIOCM_DTR:0) + + ((info->signals & SerialSignal_DCD) ? TIOCM_CAR:0) + + ((info->signals & SerialSignal_RI) ? TIOCM_RNG:0) + + ((info->signals & SerialSignal_DSR) ? TIOCM_DSR:0) + + ((info->signals & SerialSignal_CTS) ? 
TIOCM_CTS:0); + + DBGINFO(("%s tiocmget value=%08X\n", info->device_name, result)); + return result; +} + +/* + * set modem control signals (DTR/RTS) + * + * cmd signal command: TIOCMBIS = set bit TIOCMBIC = clear bit + * TIOCMSET = set/clear signal values + * value bit mask for command + */ +static int tiocmset(struct tty_struct *tty, struct file *file, + unsigned int set, unsigned int clear) +{ + struct slgt_info *info = tty->driver_data; + unsigned long flags; + + DBGINFO(("%s tiocmset(%x,%x)\n", info->device_name, set, clear)); + + if (set & TIOCM_RTS) + info->signals |= SerialSignal_RTS; + if (set & TIOCM_DTR) + info->signals |= SerialSignal_DTR; + if (clear & TIOCM_RTS) + info->signals &= ~SerialSignal_RTS; + if (clear & TIOCM_DTR) + info->signals &= ~SerialSignal_DTR; + + spin_lock_irqsave(&info->lock,flags); + set_signals(info); + spin_unlock_irqrestore(&info->lock,flags); + return 0; +} + +/* + * block current process until the device is ready to open + */ +static int block_til_ready(struct tty_struct *tty, struct file *filp, + struct slgt_info *info) +{ + DECLARE_WAITQUEUE(wait, current); + int retval; + int do_clocal = 0, extra_count = 0; + unsigned long flags; + + DBGINFO(("%s block_til_ready\n", tty->driver->name)); + + if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){ + /* nonblock mode is set or port is not enabled */ + info->flags |= ASYNC_NORMAL_ACTIVE; + return 0; + } + + if (tty->termios->c_cflag & CLOCAL) + do_clocal = 1; + + /* Wait for carrier detect and the line to become + * free (i.e., not in use by the callout). While we are in + * this loop, info->count is dropped by one, so that + * close() knows when to free things. We restore it upon + * exit, either normal or abnormal. + */ + + retval = 0; + add_wait_queue(&info->open_wait, &wait); + + spin_lock_irqsave(&info->lock, flags); + if (!tty_hung_up_p(filp)) { + extra_count = 1; + info->count--; + } + spin_unlock_irqrestore(&info->lock, flags); + info->blocked_open++; + + while (1) { + if ((tty->termios->c_cflag & CBAUD)) { + spin_lock_irqsave(&info->lock,flags); + info->signals |= SerialSignal_RTS + SerialSignal_DTR; + set_signals(info); + spin_unlock_irqrestore(&info->lock,flags); + } + + set_current_state(TASK_INTERRUPTIBLE); + + if (tty_hung_up_p(filp) || !(info->flags & ASYNC_INITIALIZED)){ + retval = (info->flags & ASYNC_HUP_NOTIFY) ? + -EAGAIN : -ERESTARTSYS; + break; + } + + spin_lock_irqsave(&info->lock,flags); + get_signals(info); + spin_unlock_irqrestore(&info->lock,flags); + + if (!(info->flags & ASYNC_CLOSING) && + (do_clocal || (info->signals & SerialSignal_DCD)) ) { + break; + } + + if (signal_pending(current)) { + retval = -ERESTARTSYS; + break; + } + + DBGINFO(("%s block_til_ready wait\n", tty->driver->name)); + schedule(); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&info->open_wait, &wait); + + if (extra_count) + info->count++; + info->blocked_open--; + + if (!retval) + info->flags |= ASYNC_NORMAL_ACTIVE; + + DBGINFO(("%s block_til_ready ready, rc=%d\n", tty->driver->name, retval)); + return retval; +} + +static int alloc_tmp_rbuf(struct slgt_info *info) +{ + info->tmp_rbuf = kmalloc(info->max_frame_size, GFP_KERNEL); + if (info->tmp_rbuf == NULL) + return -ENOMEM; + return 0; +} + +static void free_tmp_rbuf(struct slgt_info *info) +{ + kfree(info->tmp_rbuf); + info->tmp_rbuf = NULL; +} + +/* + * allocate DMA descriptor lists. 
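+ *
+ * a single DMA-coherent block holds rbuf_count receive descriptors
+ * followed by tbuf_count transmit descriptors; each list is circular,
+ * with the last descriptor's 'next' field pointing back to the first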
+ */ +static int alloc_desc(struct slgt_info *info) +{ + unsigned int i; + unsigned int pbufs; + + /* allocate memory to hold descriptor lists */ + info->bufs = pci_alloc_consistent(info->pdev, DESC_LIST_SIZE, &info->bufs_dma_addr); + if (info->bufs == NULL) + return -ENOMEM; + + memset(info->bufs, 0, DESC_LIST_SIZE); + + info->rbufs = (struct slgt_desc*)info->bufs; + info->tbufs = ((struct slgt_desc*)info->bufs) + info->rbuf_count; + + pbufs = (unsigned int)info->bufs_dma_addr; + + /* + * Build circular lists of descriptors + */ + + for (i=0; i < info->rbuf_count; i++) { + /* physical address of this descriptor */ + info->rbufs[i].pdesc = pbufs + (i * sizeof(struct slgt_desc)); + + /* physical address of next descriptor */ + if (i == info->rbuf_count - 1) + info->rbufs[i].next = cpu_to_le32(pbufs); + else + info->rbufs[i].next = cpu_to_le32(pbufs + ((i+1) * sizeof(struct slgt_desc))); + set_desc_count(info->rbufs[i], DMABUFSIZE); + } + + for (i=0; i < info->tbuf_count; i++) { + /* physical address of this descriptor */ + info->tbufs[i].pdesc = pbufs + ((info->rbuf_count + i) * sizeof(struct slgt_desc)); + + /* physical address of next descriptor */ + if (i == info->tbuf_count - 1) + info->tbufs[i].next = cpu_to_le32(pbufs + info->rbuf_count * sizeof(struct slgt_desc)); + else + info->tbufs[i].next = cpu_to_le32(pbufs + ((info->rbuf_count + i + 1) * sizeof(struct slgt_desc))); + } + + return 0; +} + +static void free_desc(struct slgt_info *info) +{ + if (info->bufs != NULL) { + pci_free_consistent(info->pdev, DESC_LIST_SIZE, info->bufs, info->bufs_dma_addr); + info->bufs = NULL; + info->rbufs = NULL; + info->tbufs = NULL; + } +} + +static int alloc_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count) +{ + int i; + for (i=0; i < count; i++) { + if ((bufs[i].buf = pci_alloc_consistent(info->pdev, DMABUFSIZE, &bufs[i].buf_dma_addr)) == NULL) + return -ENOMEM; + bufs[i].pbuf = cpu_to_le32((unsigned int)bufs[i].buf_dma_addr); + } + return 0; +} + +static void free_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count) +{ + int i; + for (i=0; i < count; i++) { + if (bufs[i].buf == NULL) + continue; + pci_free_consistent(info->pdev, DMABUFSIZE, bufs[i].buf, bufs[i].buf_dma_addr); + bufs[i].buf = NULL; + } +} + +static int alloc_dma_bufs(struct slgt_info *info) +{ + info->rbuf_count = 32; + info->tbuf_count = 32; + + if (alloc_desc(info) < 0 || + alloc_bufs(info, info->rbufs, info->rbuf_count) < 0 || + alloc_bufs(info, info->tbufs, info->tbuf_count) < 0 || + alloc_tmp_rbuf(info) < 0) { + DBGERR(("%s DMA buffer alloc fail\n", info->device_name)); + return -ENOMEM; + } + reset_rbufs(info); + return 0; +} + +static void free_dma_bufs(struct slgt_info *info) +{ + if (info->bufs) { + free_bufs(info, info->rbufs, info->rbuf_count); + free_bufs(info, info->tbufs, info->tbuf_count); + free_desc(info); + } + free_tmp_rbuf(info); +} + +static int claim_resources(struct slgt_info *info) +{ + if (request_mem_region(info->phys_reg_addr, SLGT_REG_SIZE, "synclink_gt") == NULL) { + DBGERR(("%s reg addr conflict, addr=%08X\n", + info->device_name, info->phys_reg_addr)); + info->init_error = DiagStatus_AddressConflict; + goto errout; + } + else + info->reg_addr_requested = 1; + + info->reg_addr = ioremap(info->phys_reg_addr, PAGE_SIZE); + if (!info->reg_addr) { + DBGERR(("%s cant map device registers, addr=%08X\n", + info->device_name, info->phys_reg_addr)); + info->init_error = DiagStatus_CantAssignPciResources; + goto errout; + } + info->reg_addr += info->reg_offset; + return 0; + +errout: + 
release_resources(info); + return -ENODEV; +} + +static void release_resources(struct slgt_info *info) +{ + if (info->irq_requested) { + free_irq(info->irq_level, info); + info->irq_requested = 0; + } + + if (info->reg_addr_requested) { + release_mem_region(info->phys_reg_addr, SLGT_REG_SIZE); + info->reg_addr_requested = 0; + } + + if (info->reg_addr) { + iounmap(info->reg_addr - info->reg_offset); + info->reg_addr = NULL; + } +} + +/* Add the specified device instance data structure to the + * global linked list of devices and increment the device count. + */ +static void add_device(struct slgt_info *info) +{ + char *devstr; + + info->next_device = NULL; + info->line = slgt_device_count; + sprintf(info->device_name, "%s%d", tty_dev_prefix, info->line); + + if (info->line < MAX_DEVICES) { + if (maxframe[info->line]) + info->max_frame_size = maxframe[info->line]; + info->dosyncppp = dosyncppp[info->line]; + } + + slgt_device_count++; + + if (!slgt_device_list) + slgt_device_list = info; + else { + struct slgt_info *current_dev = slgt_device_list; + while(current_dev->next_device) + current_dev = current_dev->next_device; + current_dev->next_device = info; + } + + if (info->max_frame_size < 4096) + info->max_frame_size = 4096; + else if (info->max_frame_size > 65535) + info->max_frame_size = 65535; + + switch(info->pdev->device) { + case SYNCLINK_GT_DEVICE_ID: + devstr = "GT"; + break; + case SYNCLINK_GT4_DEVICE_ID: + devstr = "GT4"; + break; + case SYNCLINK_AC_DEVICE_ID: + devstr = "AC"; + info->params.mode = MGSL_MODE_ASYNC; + break; + default: + devstr = "(unknown model)"; + } + printk("SyncLink %s %s IO=%08x IRQ=%d MaxFrameSize=%u\n", + devstr, info->device_name, info->phys_reg_addr, + info->irq_level, info->max_frame_size); + +#ifdef CONFIG_HDLC + hdlcdev_init(info); +#endif +} + +/* + * allocate device instance structure, return NULL on failure + */ +static struct slgt_info *alloc_dev(int adapter_num, int port_num, struct pci_dev *pdev) +{ + struct slgt_info *info; + + info = kmalloc(sizeof(struct slgt_info), GFP_KERNEL); + + if (!info) { + DBGERR(("%s device alloc failed adapter=%d port=%d\n", + driver_name, adapter_num, port_num)); + } else { + memset(info, 0, sizeof(struct slgt_info)); + info->magic = MGSL_MAGIC; + INIT_WORK(&info->task, bh_handler, info); + info->max_frame_size = 4096; + info->raw_rx_size = DMABUFSIZE; + info->close_delay = 5*HZ/10; + info->closing_wait = 30*HZ; + init_waitqueue_head(&info->open_wait); + init_waitqueue_head(&info->close_wait); + init_waitqueue_head(&info->status_event_wait_q); + init_waitqueue_head(&info->event_wait_q); + spin_lock_init(&info->netlock); + memcpy(&info->params,&default_params,sizeof(MGSL_PARAMS)); + info->idle_mode = HDLC_TXIDLE_FLAGS; + info->adapter_num = adapter_num; + info->port_num = port_num; + + init_timer(&info->tx_timer); + info->tx_timer.data = (unsigned long)info; + info->tx_timer.function = tx_timeout; + + init_timer(&info->rx_timer); + info->rx_timer.data = (unsigned long)info; + info->rx_timer.function = rx_timeout; + + /* Copy configuration info to device instance data */ + info->pdev = pdev; + info->irq_level = pdev->irq; + info->phys_reg_addr = pci_resource_start(pdev,0); + + /* veremap works on page boundaries + * map full page starting at the page boundary + */ + info->reg_offset = info->phys_reg_addr & (PAGE_SIZE-1); + info->phys_reg_addr &= ~(PAGE_SIZE-1); + + info->bus_type = MGSL_BUS_TYPE_PCI; + info->irq_flags = SA_SHIRQ; + + info->init_error = -1; /* assume error, set to 0 on successful init */ + } + + 
return info; +} + +static void device_init(int adapter_num, struct pci_dev *pdev) +{ + struct slgt_info *port_array[SLGT_MAX_PORTS]; + int i; + int port_count = 1; + + if (pdev->device == SYNCLINK_GT4_DEVICE_ID) + port_count = 4; + + /* allocate device instances for all ports */ + for (i=0; i < port_count; ++i) { + port_array[i] = alloc_dev(adapter_num, i, pdev); + if (port_array[i] == NULL) { + for (--i; i >= 0; --i) + kfree(port_array[i]); + return; + } + } + + /* give copy of port_array to all ports and add to device list */ + for (i=0; i < port_count; ++i) { + memcpy(port_array[i]->port_array, port_array, sizeof(port_array)); + add_device(port_array[i]); + port_array[i]->port_count = port_count; + spin_lock_init(&port_array[i]->lock); + } + + /* Allocate and claim adapter resources */ + if (!claim_resources(port_array[0])) { + + alloc_dma_bufs(port_array[0]); + + /* copy resource information from first port to others */ + for (i = 1; i < port_count; ++i) { + port_array[i]->lock = port_array[0]->lock; + port_array[i]->irq_level = port_array[0]->irq_level; + port_array[i]->reg_addr = port_array[0]->reg_addr; + alloc_dma_bufs(port_array[i]); + } + + if (request_irq(port_array[0]->irq_level, + slgt_interrupt, + port_array[0]->irq_flags, + port_array[0]->device_name, + port_array[0]) < 0) { + DBGERR(("%s request_irq failed IRQ=%d\n", + port_array[0]->device_name, + port_array[0]->irq_level)); + } else { + port_array[0]->irq_requested = 1; + adapter_test(port_array[0]); + for (i=1 ; i < port_count ; i++) + port_array[i]->init_error = port_array[0]->init_error; + } + } +} + +static int __devinit init_one(struct pci_dev *dev, + const struct pci_device_id *ent) +{ + if (pci_enable_device(dev)) { + printk("error enabling pci device %p\n", dev); + return -EIO; + } + pci_set_master(dev); + device_init(slgt_device_count, dev); + return 0; +} + +static void __devexit remove_one(struct pci_dev *dev) +{ +} + +static struct tty_operations ops = { + .open = open, + .close = close, + .write = write, + .put_char = put_char, + .flush_chars = flush_chars, + .write_room = write_room, + .chars_in_buffer = chars_in_buffer, + .flush_buffer = flush_buffer, + .ioctl = ioctl, + .throttle = throttle, + .unthrottle = unthrottle, + .send_xchar = send_xchar, + .break_ctl = set_break, + .wait_until_sent = wait_until_sent, + .read_proc = read_proc, + .set_termios = set_termios, + .stop = tx_hold, + .start = tx_release, + .hangup = hangup, + .tiocmget = tiocmget, + .tiocmset = tiocmset, +}; + +static void slgt_cleanup(void) +{ + int rc; + struct slgt_info *info; + struct slgt_info *tmp; + + printk("unload %s %s\n", driver_name, driver_version); + + if (serial_driver) { + if ((rc = tty_unregister_driver(serial_driver))) + DBGERR(("tty_unregister_driver error=%d\n", rc)); + put_tty_driver(serial_driver); + } + + /* reset devices */ + info = slgt_device_list; + while(info) { + reset_port(info); + info = info->next_device; + } + + /* release devices */ + info = slgt_device_list; + while(info) { +#ifdef CONFIG_HDLC + hdlcdev_exit(info); +#endif + free_dma_bufs(info); + free_tmp_rbuf(info); + if (info->port_num == 0) + release_resources(info); + tmp = info; + info = info->next_device; + kfree(tmp); + } + + if (pci_registered) + pci_unregister_driver(&pci_driver); +} + +/* + * Driver initialization entry point. 
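+ *
+ * registers the PCI driver first so adapters are discovered and added
+ * to slgt_device_list, then allocates and registers the tty driver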
+ */ +static int __init slgt_init(void) +{ + int rc; + + printk("%s %s\n", driver_name, driver_version); + + slgt_device_count = 0; + if ((rc = pci_register_driver(&pci_driver)) < 0) { + printk("%s pci_register_driver error=%d\n", driver_name, rc); + return rc; + } + pci_registered = 1; + + if (!slgt_device_list) { + printk("%s no devices found\n",driver_name); + return -ENODEV; + } + + serial_driver = alloc_tty_driver(MAX_DEVICES); + if (!serial_driver) { + rc = -ENOMEM; + goto error; + } + + /* Initialize the tty_driver structure */ + + serial_driver->owner = THIS_MODULE; + serial_driver->driver_name = tty_driver_name; + serial_driver->name = tty_dev_prefix; + serial_driver->major = ttymajor; + serial_driver->minor_start = 64; + serial_driver->type = TTY_DRIVER_TYPE_SERIAL; + serial_driver->subtype = SERIAL_TYPE_NORMAL; + serial_driver->init_termios = tty_std_termios; + serial_driver->init_termios.c_cflag = + B9600 | CS8 | CREAD | HUPCL | CLOCAL; + serial_driver->flags = TTY_DRIVER_REAL_RAW; + tty_set_operations(serial_driver, &ops); + if ((rc = tty_register_driver(serial_driver)) < 0) { + DBGERR(("%s can't register serial driver\n", driver_name)); + put_tty_driver(serial_driver); + serial_driver = NULL; + goto error; + } + + printk("%s %s, tty major#%d\n", + driver_name, driver_version, + serial_driver->major); + + return 0; + +error: + slgt_cleanup(); + return rc; +} + +static void __exit slgt_exit(void) +{ + slgt_cleanup(); +} + +module_init(slgt_init); +module_exit(slgt_exit); + +/* + * register access routines + */ + +#define CALC_REGADDR() \ + unsigned long reg_addr = ((unsigned long)info->reg_addr) + addr; \ + if (addr >= 0x80) \ + reg_addr += (info->port_num) * 32; + +static __u8 rd_reg8(struct slgt_info *info, unsigned int addr) +{ + CALC_REGADDR(); + return readb((void __iomem *)reg_addr); +} + +static void wr_reg8(struct slgt_info *info, unsigned int addr, __u8 value) +{ + CALC_REGADDR(); + writeb(value, (void __iomem *)reg_addr); +} + +static __u16 rd_reg16(struct slgt_info *info, unsigned int addr) +{ + CALC_REGADDR(); + return readw((void __iomem *)reg_addr); +} + +static void wr_reg16(struct slgt_info *info, unsigned int addr, __u16 value) +{ + CALC_REGADDR(); + writew(value, (void __iomem *)reg_addr); +} + +static __u32 rd_reg32(struct slgt_info *info, unsigned int addr) +{ + CALC_REGADDR(); + return readl((void __iomem *)reg_addr); +} + +static void wr_reg32(struct slgt_info *info, unsigned int addr, __u32 value) +{ + CALC_REGADDR(); + writel(value, (void __iomem *)reg_addr); +} + +static void rdma_reset(struct slgt_info *info) +{ + unsigned int i; + + /* set reset bit */ + wr_reg32(info, RDCSR, BIT1); + + /* wait for enable bit cleared */ + for(i=0 ; i < 1000 ; i++) + if (!(rd_reg32(info, RDCSR) & BIT0)) + break; +} + +static void tdma_reset(struct slgt_info *info) +{ + unsigned int i; + + /* set reset bit */ + wr_reg32(info, TDCSR, BIT1); + + /* wait for enable bit cleared */ + for(i=0 ; i < 1000 ; i++) + if (!(rd_reg32(info, TDCSR) & BIT0)) + break; +} + +/* + * enable internal loopback + * TxCLK and RxCLK are generated from BRG + * and TxD is looped back to RxD internally. 
+ */ +static void enable_loopback(struct slgt_info *info) +{ + /* SCR (serial control) BIT2=looopback enable */ + wr_reg16(info, SCR, (unsigned short)(rd_reg16(info, SCR) | BIT2)); + + if (info->params.mode != MGSL_MODE_ASYNC) { + /* CCR (clock control) + * 07..05 tx clock source (010 = BRG) + * 04..02 rx clock source (010 = BRG) + * 01 auxclk enable (0 = disable) + * 00 BRG enable (1 = enable) + * + * 0100 1001 + */ + wr_reg8(info, CCR, 0x49); + + /* set speed if available, otherwise use default */ + if (info->params.clock_speed) + set_rate(info, info->params.clock_speed); + else + set_rate(info, 3686400); + } +} + +/* + * set baud rate generator to specified rate + */ +static void set_rate(struct slgt_info *info, u32 rate) +{ + unsigned int div; + static unsigned int osc = 14745600; + + /* div = osc/rate - 1 + * + * Round div up if osc/rate is not integer to + * force to next slowest rate. + */ + + if (rate) { + div = osc/rate; + if (!(osc % rate) && div) + div--; + wr_reg16(info, BDR, (unsigned short)div); + } +} + +static void rx_stop(struct slgt_info *info) +{ + unsigned short val; + + /* disable and reset receiver */ + val = rd_reg16(info, RCR) & ~BIT1; /* clear enable bit */ + wr_reg16(info, RCR, (unsigned short)(val | BIT2)); /* set reset bit */ + wr_reg16(info, RCR, val); /* clear reset bit */ + + slgt_irq_off(info, IRQ_RXOVER + IRQ_RXDATA + IRQ_RXIDLE); + + /* clear pending rx interrupts */ + wr_reg16(info, SSR, IRQ_RXIDLE + IRQ_RXOVER); + + rdma_reset(info); + + info->rx_enabled = 0; + info->rx_restart = 0; +} + +static void rx_start(struct slgt_info *info) +{ + unsigned short val; + + slgt_irq_off(info, IRQ_RXOVER + IRQ_RXDATA); + + /* clear pending rx overrun IRQ */ + wr_reg16(info, SSR, IRQ_RXOVER); + + /* reset and disable receiver */ + val = rd_reg16(info, RCR) & ~BIT1; /* clear enable bit */ + wr_reg16(info, RCR, (unsigned short)(val | BIT2)); /* set reset bit */ + wr_reg16(info, RCR, val); /* clear reset bit */ + + rdma_reset(info); + reset_rbufs(info); + + /* set 1st descriptor address */ + wr_reg32(info, RDDAR, info->rbufs[0].pdesc); + + if (info->params.mode != MGSL_MODE_ASYNC) { + /* enable rx DMA and DMA interrupt */ + wr_reg32(info, RDCSR, (BIT2 + BIT0)); + } else { + /* enable saving of rx status, rx DMA and DMA interrupt */ + wr_reg32(info, RDCSR, (BIT6 + BIT2 + BIT0)); + } + + slgt_irq_on(info, IRQ_RXOVER); + + /* enable receiver */ + wr_reg16(info, RCR, (unsigned short)(rd_reg16(info, RCR) | BIT1)); + + info->rx_restart = 0; + info->rx_enabled = 1; +} + +static void tx_start(struct slgt_info *info) +{ + if (!info->tx_enabled) { + wr_reg16(info, TCR, + (unsigned short)(rd_reg16(info, TCR) | BIT1)); + info->tx_enabled = TRUE; + } + + if (info->tx_count) { + info->drop_rts_on_tx_done = 0; + + if (info->params.mode != MGSL_MODE_ASYNC) { + if (info->params.flags & HDLC_FLAG_AUTO_RTS) { + get_signals(info); + if (!(info->signals & SerialSignal_RTS)) { + info->signals |= SerialSignal_RTS; + set_signals(info); + info->drop_rts_on_tx_done = 1; + } + } + + slgt_irq_off(info, IRQ_TXDATA); + slgt_irq_on(info, IRQ_TXUNDER + IRQ_TXIDLE); + /* clear tx idle and underrun status bits */ + wr_reg16(info, SSR, (unsigned short)(IRQ_TXIDLE + IRQ_TXUNDER)); + + if (!(rd_reg32(info, TDCSR) & BIT0)) { + /* tx DMA stopped, restart tx DMA */ + tdma_reset(info); + /* set 1st descriptor address */ + wr_reg32(info, TDDAR, info->tbufs[info->tbuf_start].pdesc); + if (info->params.mode == MGSL_MODE_RAW) + wr_reg32(info, TDCSR, BIT2 + BIT0); /* IRQ + DMA enable */ + else + wr_reg32(info, TDCSR, 
BIT0); /* DMA enable */ + } + + if (info->params.mode != MGSL_MODE_RAW) { + info->tx_timer.expires = jiffies + msecs_to_jiffies(5000); + add_timer(&info->tx_timer); + } + } else { + tdma_reset(info); + /* set 1st descriptor address */ + wr_reg32(info, TDDAR, info->tbufs[info->tbuf_start].pdesc); + + slgt_irq_off(info, IRQ_TXDATA); + slgt_irq_on(info, IRQ_TXIDLE); + /* clear tx idle status bit */ + wr_reg16(info, SSR, IRQ_TXIDLE); + + /* enable tx DMA */ + wr_reg32(info, TDCSR, BIT0); + } + + info->tx_active = 1; + } +} + +static void tx_stop(struct slgt_info *info) +{ + unsigned short val; + + del_timer(&info->tx_timer); + + tdma_reset(info); + + /* reset and disable transmitter */ + val = rd_reg16(info, TCR) & ~BIT1; /* clear enable bit */ + wr_reg16(info, TCR, (unsigned short)(val | BIT2)); /* set reset bit */ + wr_reg16(info, TCR, val); /* clear reset */ + + slgt_irq_off(info, IRQ_TXDATA + IRQ_TXIDLE + IRQ_TXUNDER); + + /* clear tx idle and underrun status bit */ + wr_reg16(info, SSR, (unsigned short)(IRQ_TXIDLE + IRQ_TXUNDER)); + + reset_tbufs(info); + + info->tx_enabled = 0; + info->tx_active = 0; +} + +static void reset_port(struct slgt_info *info) +{ + if (!info->reg_addr) + return; + + tx_stop(info); + rx_stop(info); + + info->signals &= ~(SerialSignal_DTR + SerialSignal_RTS); + set_signals(info); + + slgt_irq_off(info, IRQ_ALL | IRQ_MASTER); +} + +static void reset_adapter(struct slgt_info *info) +{ + int i; + for (i=0; i < info->port_count; ++i) { + if (info->port_array[i]) + reset_port(info->port_array[i]); + } +} + +static void async_mode(struct slgt_info *info) +{ + unsigned short val; + + slgt_irq_off(info, IRQ_ALL | IRQ_MASTER); + tx_stop(info); + rx_stop(info); + + /* TCR (tx control) + * + * 15..13 mode, 010=async + * 12..10 encoding, 000=NRZ + * 09 parity enable + * 08 1=odd parity, 0=even parity + * 07 1=RTS driver control + * 06 1=break enable + * 05..04 character length + * 00=5 bits + * 01=6 bits + * 10=7 bits + * 11=8 bits + * 03 0=1 stop bit, 1=2 stop bits + * 02 reset + * 01 enable + * 00 auto-CTS enable + */ + val = 0x4000; + + if (info->if_mode & MGSL_INTERFACE_RTS_EN) + val |= BIT7; + + if (info->params.parity != ASYNC_PARITY_NONE) { + val |= BIT9; + if (info->params.parity == ASYNC_PARITY_ODD) + val |= BIT8; + } + + switch (info->params.data_bits) + { + case 6: val |= BIT4; break; + case 7: val |= BIT5; break; + case 8: val |= BIT5 + BIT4; break; + } + + if (info->params.stop_bits != 1) + val |= BIT3; + + if (info->params.flags & HDLC_FLAG_AUTO_CTS) + val |= BIT0; + + wr_reg16(info, TCR, val); + + /* RCR (rx control) + * + * 15..13 mode, 010=async + * 12..10 encoding, 000=NRZ + * 09 parity enable + * 08 1=odd parity, 0=even parity + * 07..06 reserved, must be 0 + * 05..04 character length + * 00=5 bits + * 01=6 bits + * 10=7 bits + * 11=8 bits + * 03 reserved, must be zero + * 02 reset + * 01 enable + * 00 auto-DCD enable + */ + val = 0x4000; + + if (info->params.parity != ASYNC_PARITY_NONE) { + val |= BIT9; + if (info->params.parity == ASYNC_PARITY_ODD) + val |= BIT8; + } + + switch (info->params.data_bits) + { + case 6: val |= BIT4; break; + case 7: val |= BIT5; break; + case 8: val |= BIT5 + BIT4; break; + } + + if (info->params.flags & HDLC_FLAG_AUTO_DCD) + val |= BIT0; + + wr_reg16(info, RCR, val); + + /* CCR (clock control) + * + * 07..05 011 = tx clock source is BRG/16 + * 04..02 010 = rx clock source is BRG + * 01 0 = auxclk disabled + * 00 1 = BRG enabled + * + * 0110 1001 + */ + wr_reg8(info, CCR, 0x69); + + msc_set_vcr(info); + + tx_set_idle(info); + 
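+ /* note: the modem status IRQs (DCD/CTS/DSR) are enabled by the caller
+  * (program_hw) in normal operation; SCR here sets only the FIFO
+  * request levels and the master IRQ enable
+  */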
+ /* SCR (serial control) + * + * 15 1=tx req on FIFO half empty + * 14 1=rx req on FIFO half full + * 13 tx data IRQ enable + * 12 tx idle IRQ enable + * 11 rx break on IRQ enable + * 10 rx data IRQ enable + * 09 rx break off IRQ enable + * 08 overrun IRQ enable + * 07 DSR IRQ enable + * 06 CTS IRQ enable + * 05 DCD IRQ enable + * 04 RI IRQ enable + * 03 reserved, must be zero + * 02 1=txd->rxd internal loopback enable + * 01 reserved, must be zero + * 00 1=master IRQ enable + */ + val = BIT15 + BIT14 + BIT0; + wr_reg16(info, SCR, val); + + slgt_irq_on(info, IRQ_RXBREAK | IRQ_RXOVER); + + set_rate(info, info->params.data_rate * 16); + + if (info->params.loopback) + enable_loopback(info); +} + +static void hdlc_mode(struct slgt_info *info) +{ + unsigned short val; + + slgt_irq_off(info, IRQ_ALL | IRQ_MASTER); + tx_stop(info); + rx_stop(info); + + /* TCR (tx control) + * + * 15..13 mode, 000=HDLC 001=raw sync + * 12..10 encoding + * 09 CRC enable + * 08 CRC32 + * 07 1=RTS driver control + * 06 preamble enable + * 05..04 preamble length + * 03 share open/close flag + * 02 reset + * 01 enable + * 00 auto-CTS enable + */ + val = 0; + + if (info->params.mode == MGSL_MODE_RAW) + val |= BIT13; + if (info->if_mode & MGSL_INTERFACE_RTS_EN) + val |= BIT7; + + switch(info->params.encoding) + { + case HDLC_ENCODING_NRZB: val |= BIT10; break; + case HDLC_ENCODING_NRZI_MARK: val |= BIT11; break; + case HDLC_ENCODING_NRZI: val |= BIT11 + BIT10; break; + case HDLC_ENCODING_BIPHASE_MARK: val |= BIT12; break; + case HDLC_ENCODING_BIPHASE_SPACE: val |= BIT12 + BIT10; break; + case HDLC_ENCODING_BIPHASE_LEVEL: val |= BIT12 + BIT11; break; + case HDLC_ENCODING_DIFF_BIPHASE_LEVEL: val |= BIT12 + BIT11 + BIT10; break; + } + + switch (info->params.crc_type) + { + case HDLC_CRC_16_CCITT: val |= BIT9; break; + case HDLC_CRC_32_CCITT: val |= BIT9 + BIT8; break; + } + + if (info->params.preamble != HDLC_PREAMBLE_PATTERN_NONE) + val |= BIT6; + + switch (info->params.preamble_length) + { + case HDLC_PREAMBLE_LENGTH_16BITS: val |= BIT5; break; + case HDLC_PREAMBLE_LENGTH_32BITS: val |= BIT4; break; + case HDLC_PREAMBLE_LENGTH_64BITS: val |= BIT5 + BIT4; break; + } + + if (info->params.flags & HDLC_FLAG_AUTO_CTS) + val |= BIT0; + + wr_reg16(info, TCR, val); + + /* TPR (transmit preamble) */ + + switch (info->params.preamble) + { + case HDLC_PREAMBLE_PATTERN_FLAGS: val = 0x7e; break; + case HDLC_PREAMBLE_PATTERN_ONES: val = 0xff; break; + case HDLC_PREAMBLE_PATTERN_ZEROS: val = 0x00; break; + case HDLC_PREAMBLE_PATTERN_10: val = 0x55; break; + case HDLC_PREAMBLE_PATTERN_01: val = 0xaa; break; + default: val = 0x7e; break; + } + wr_reg8(info, TPR, (unsigned char)val); + + /* RCR (rx control) + * + * 15..13 mode, 000=HDLC 001=raw sync + * 12..10 encoding + * 09 CRC enable + * 08 CRC32 + * 07..03 reserved, must be 0 + * 02 reset + * 01 enable + * 00 auto-DCD enable + */ + val = 0; + + if (info->params.mode == MGSL_MODE_RAW) + val |= BIT13; + + switch(info->params.encoding) + { + case HDLC_ENCODING_NRZB: val |= BIT10; break; + case HDLC_ENCODING_NRZI_MARK: val |= BIT11; break; + case HDLC_ENCODING_NRZI: val |= BIT11 + BIT10; break; + case HDLC_ENCODING_BIPHASE_MARK: val |= BIT12; break; + case HDLC_ENCODING_BIPHASE_SPACE: val |= BIT12 + BIT10; break; + case HDLC_ENCODING_BIPHASE_LEVEL: val |= BIT12 + BIT11; break; + case HDLC_ENCODING_DIFF_BIPHASE_LEVEL: val |= BIT12 + BIT11 + BIT10; break; + } + + switch (info->params.crc_type) + { + case HDLC_CRC_16_CCITT: val |= BIT9; break; + case HDLC_CRC_32_CCITT: val |= BIT9 + BIT8; 
break; + } + + if (info->params.flags & HDLC_FLAG_AUTO_DCD) + val |= BIT0; + + wr_reg16(info, RCR, val); + + /* CCR (clock control) + * + * 07..05 tx clock source + * 04..02 rx clock source + * 01 auxclk enable + * 00 BRG enable + */ + val = 0; + + if (info->params.flags & HDLC_FLAG_TXC_BRG) + { + // when RxC source is DPLL, BRG generates 16X DPLL + // reference clock, so take TxC from BRG/16 to get + // transmit clock at actual data rate + if (info->params.flags & HDLC_FLAG_RXC_DPLL) + val |= BIT6 + BIT5; /* 011, txclk = BRG/16 */ + else + val |= BIT6; /* 010, txclk = BRG */ + } + else if (info->params.flags & HDLC_FLAG_TXC_DPLL) + val |= BIT7; /* 100, txclk = DPLL Input */ + else if (info->params.flags & HDLC_FLAG_TXC_RXCPIN) + val |= BIT5; /* 001, txclk = RXC Input */ + + if (info->params.flags & HDLC_FLAG_RXC_BRG) + val |= BIT3; /* 010, rxclk = BRG */ + else if (info->params.flags & HDLC_FLAG_RXC_DPLL) + val |= BIT4; /* 100, rxclk = DPLL */ + else if (info->params.flags & HDLC_FLAG_RXC_TXCPIN) + val |= BIT2; /* 001, rxclk = TXC Input */ + + if (info->params.clock_speed) + val |= BIT1 + BIT0; + + wr_reg8(info, CCR, (unsigned char)val); + + if (info->params.flags & (HDLC_FLAG_TXC_DPLL + HDLC_FLAG_RXC_DPLL)) + { + // program DPLL mode + switch(info->params.encoding) + { + case HDLC_ENCODING_BIPHASE_MARK: + case HDLC_ENCODING_BIPHASE_SPACE: + val = BIT7; break; + case HDLC_ENCODING_BIPHASE_LEVEL: + case HDLC_ENCODING_DIFF_BIPHASE_LEVEL: + val = BIT7 + BIT6; break; + default: val = BIT6; // NRZ encodings + } + wr_reg16(info, RCR, (unsigned short)(rd_reg16(info, RCR) | val)); + + // DPLL requires a 16X reference clock from BRG + set_rate(info, info->params.clock_speed * 16); + } + else + set_rate(info, info->params.clock_speed); + + tx_set_idle(info); + + msc_set_vcr(info); + + /* SCR (serial control) + * + * 15 1=tx req on FIFO half empty + * 14 1=rx req on FIFO half full + * 13 tx data IRQ enable + * 12 tx idle IRQ enable + * 11 underrun IRQ enable + * 10 rx data IRQ enable + * 09 rx idle IRQ enable + * 08 overrun IRQ enable + * 07 DSR IRQ enable + * 06 CTS IRQ enable + * 05 DCD IRQ enable + * 04 RI IRQ enable + * 03 reserved, must be zero + * 02 1=txd->rxd internal loopback enable + * 01 reserved, must be zero + * 00 1=master IRQ enable + */ + wr_reg16(info, SCR, BIT15 + BIT14 + BIT0); + + if (info->params.loopback) + enable_loopback(info); +} + +/* + * set transmit idle mode + */ +static void tx_set_idle(struct slgt_info *info) +{ + unsigned char val = 0xff; + + switch(info->idle_mode) + { + case HDLC_TXIDLE_FLAGS: val = 0x7e; break; + case HDLC_TXIDLE_ALT_ZEROS_ONES: val = 0xaa; break; + case HDLC_TXIDLE_ZEROS: val = 0x00; break; + case HDLC_TXIDLE_ONES: val = 0xff; break; + case HDLC_TXIDLE_ALT_MARK_SPACE: val = 0xaa; break; + case HDLC_TXIDLE_SPACE: val = 0x00; break; + case HDLC_TXIDLE_MARK: val = 0xff; break; + } + + wr_reg8(info, TIR, val); +} + +/* + * get state of V24 status (input) signals + */ +static void get_signals(struct slgt_info *info) +{ + unsigned short status = rd_reg16(info, SSR); + + /* clear all serial signals except DTR and RTS */ + info->signals &= SerialSignal_DTR + SerialSignal_RTS; + + if (status & BIT3) + info->signals |= SerialSignal_DSR; + if (status & BIT2) + info->signals |= SerialSignal_CTS; + if (status & BIT1) + info->signals |= SerialSignal_DCD; + if (status & BIT0) + info->signals |= SerialSignal_RI; +} + +/* + * set V.24 Control Register based on current configuration + */ +static void msc_set_vcr(struct slgt_info *info) +{ + unsigned char val = 0; + 
+ /* VCR (V.24 control) + * + * 07..04 serial IF select + * 03 DTR + * 02 RTS + * 01 LL + * 00 RL + */ + + switch(info->if_mode & MGSL_INTERFACE_MASK) + { + case MGSL_INTERFACE_RS232: + val |= BIT5; /* 0010 */ + break; + case MGSL_INTERFACE_V35: + val |= BIT7 + BIT6 + BIT5; /* 1110 */ + break; + case MGSL_INTERFACE_RS422: + val |= BIT6; /* 0100 */ + break; + } + + if (info->signals & SerialSignal_DTR) + val |= BIT3; + if (info->signals & SerialSignal_RTS) + val |= BIT2; + if (info->if_mode & MGSL_INTERFACE_LL) + val |= BIT1; + if (info->if_mode & MGSL_INTERFACE_RL) + val |= BIT0; + wr_reg8(info, VCR, val); +} + +/* + * set state of V24 control (output) signals + */ +static void set_signals(struct slgt_info *info) +{ + unsigned char val = rd_reg8(info, VCR); + if (info->signals & SerialSignal_DTR) + val |= BIT3; + else + val &= ~BIT3; + if (info->signals & SerialSignal_RTS) + val |= BIT2; + else + val &= ~BIT2; + wr_reg8(info, VCR, val); +} + +/* + * free range of receive DMA buffers (i to last) + */ +static void free_rbufs(struct slgt_info *info, unsigned int i, unsigned int last) +{ + int done = 0; + + while(!done) { + /* reset current buffer for reuse */ + info->rbufs[i].status = 0; + if (info->params.mode == MGSL_MODE_RAW) + set_desc_count(info->rbufs[i], info->raw_rx_size); + else + set_desc_count(info->rbufs[i], DMABUFSIZE); + + if (i == last) + done = 1; + if (++i == info->rbuf_count) + i = 0; + } + info->rbuf_current = i; +} + +/* + * mark all receive DMA buffers as free + */ +static void reset_rbufs(struct slgt_info *info) +{ + free_rbufs(info, 0, info->rbuf_count - 1); +} + +/* + * pass receive HDLC frame to upper layer + * + * return 1 if frame available, otherwise 0 + */ +static int rx_get_frame(struct slgt_info *info) +{ + unsigned int start, end; + unsigned short status; + unsigned int framesize = 0; + int rc = 0; + unsigned long flags; + struct tty_struct *tty = info->tty; + unsigned char addr_field = 0xff; + +check_again: + + framesize = 0; + addr_field = 0xff; + start = end = info->rbuf_current; + + for (;;) { + if (!desc_complete(info->rbufs[end])) + goto cleanup; + + if (framesize == 0 && info->params.addr_filter != 0xff) + addr_field = info->rbufs[end].buf[0]; + + framesize += desc_count(info->rbufs[end]); + + if (desc_eof(info->rbufs[end])) + break; + + if (++end == info->rbuf_count) + end = 0; + + if (end == info->rbuf_current) { + if (info->rx_enabled){ + spin_lock_irqsave(&info->lock,flags); + rx_start(info); + spin_unlock_irqrestore(&info->lock,flags); + } + goto cleanup; + } + } + + /* status + * + * 15 buffer complete + * 14..06 reserved + * 05..04 residue + * 02 eof (end of frame) + * 01 CRC error + * 00 abort + */ + status = desc_status(info->rbufs[end]); + + /* ignore CRC bit if not using CRC (bit is undefined) */ + if (info->params.crc_type == HDLC_CRC_NONE) + status &= ~BIT1; + + if (framesize == 0 || + (addr_field != 0xff && addr_field != info->params.addr_filter)) { + free_rbufs(info, start, end); + goto check_again; + } + + if (framesize < 2 || status & (BIT1+BIT0)) { + if (framesize < 2 || (status & BIT0)) + info->icount.rxshort++; + else + info->icount.rxcrc++; + framesize = 0; + +#ifdef CONFIG_HDLC + { + struct net_device_stats *stats = hdlc_stats(info->netdev); + stats->rx_errors++; + stats->rx_frame_errors++; + } +#endif + } else { + /* adjust frame size for CRC, if any */ + if (info->params.crc_type == HDLC_CRC_16_CCITT) + framesize -= 2; + else if (info->params.crc_type == HDLC_CRC_32_CCITT) + framesize -= 4; + } + + DBGBH(("%s rx frame status=%04X 
size=%d\n", + info->device_name, status, framesize)); + DBGDATA(info, info->rbufs[start].buf, min_t(int, framesize, DMABUFSIZE), "rx"); + + if (framesize) { + if (framesize > info->max_frame_size) + info->icount.rxlong++; + else { + /* copy dma buffer(s) to contiguous temp buffer */ + int copy_count = framesize; + int i = start; + unsigned char *p = info->tmp_rbuf; + info->tmp_rbuf_count = framesize; + + info->icount.rxok++; + + while(copy_count) { + int partial_count = min(copy_count, DMABUFSIZE); + memcpy(p, info->rbufs[i].buf, partial_count); + p += partial_count; + copy_count -= partial_count; + if (++i == info->rbuf_count) + i = 0; + } + +#ifdef CONFIG_HDLC + if (info->netcount) + hdlcdev_rx(info,info->tmp_rbuf, framesize); + else +#endif + ldisc_receive_buf(tty, info->tmp_rbuf, info->flag_buf, framesize); + } + } + free_rbufs(info, start, end); + rc = 1; + +cleanup: + return rc; +} + +/* + * pass receive buffer (RAW synchronous mode) to tty layer + * return 1 if buffer available, otherwise 0 + */ +static int rx_get_buf(struct slgt_info *info) +{ + unsigned int i = info->rbuf_current; + + if (!desc_complete(info->rbufs[i])) + return 0; + DBGDATA(info, info->rbufs[i].buf, desc_count(info->rbufs[i]), "rx"); + DBGINFO(("rx_get_buf size=%d\n", desc_count(info->rbufs[i]))); + ldisc_receive_buf(info->tty, info->rbufs[i].buf, + info->flag_buf, desc_count(info->rbufs[i])); + free_rbufs(info, i, i); + return 1; +} + +static void reset_tbufs(struct slgt_info *info) +{ + unsigned int i; + info->tbuf_current = 0; + for (i=0 ; i < info->tbuf_count ; i++) { + info->tbufs[i].status = 0; + info->tbufs[i].count = 0; + } +} + +/* + * return number of free transmit DMA buffers + */ +static unsigned int free_tbuf_count(struct slgt_info *info) +{ + unsigned int count = 0; + unsigned int i = info->tbuf_current; + + do + { + if (desc_count(info->tbufs[i])) + break; /* buffer in use */ + ++count; + if (++i == info->tbuf_count) + i=0; + } while (i != info->tbuf_current); + + /* last buffer with zero count may be in use, assume it is */ + if (count) + --count; + + return count; +} + +/* + * load transmit DMA buffer(s) with data + */ +static void tx_load(struct slgt_info *info, const char *buf, unsigned int size) +{ + unsigned short count; + unsigned int i; + struct slgt_desc *d; + + if (size == 0) + return; + + DBGDATA(info, buf, size, "tx"); + + info->tbuf_start = i = info->tbuf_current; + + while (size) { + d = &info->tbufs[i]; + if (++i == info->tbuf_count) + i = 0; + + count = (unsigned short)((size > DMABUFSIZE) ? DMABUFSIZE : size); + memcpy(d->buf, buf, count); + + size -= count; + buf += count; + + if (!size && info->params.mode != MGSL_MODE_RAW) + set_desc_eof(*d, 1); /* HDLC: set EOF of last desc */ + else + set_desc_eof(*d, 0); + + set_desc_count(*d, count); + } + + info->tbuf_current = i; +} + +static int register_test(struct slgt_info *info) +{ + static unsigned short patterns[] = + {0x0000, 0xffff, 0xaaaa, 0x5555, 0x6969, 0x9696}; + static unsigned int count = sizeof(patterns)/sizeof(patterns[0]); + unsigned int i; + int rc = 0; + + for (i=0 ; i < count ; i++) { + wr_reg16(info, TIR, patterns[i]); + wr_reg16(info, BDR, patterns[(i+1)%count]); + if ((rd_reg16(info, TIR) != patterns[i]) || + (rd_reg16(info, BDR) != patterns[(i+1)%count])) { + rc = -ENODEV; + break; + } + } + + info->init_error = rc ? 
0 : DiagStatus_AddressFailure; + return rc; +} + +static int irq_test(struct slgt_info *info) +{ + unsigned long timeout; + unsigned long flags; + struct tty_struct *oldtty = info->tty; + u32 speed = info->params.data_rate; + + info->params.data_rate = 921600; + info->tty = NULL; + + spin_lock_irqsave(&info->lock, flags); + async_mode(info); + slgt_irq_on(info, IRQ_TXIDLE); + + /* enable transmitter */ + wr_reg16(info, TCR, + (unsigned short)(rd_reg16(info, TCR) | BIT1)); + + /* write one byte and wait for tx idle */ + wr_reg16(info, TDR, 0); + + /* assume failure */ + info->init_error = DiagStatus_IrqFailure; + info->irq_occurred = FALSE; + + spin_unlock_irqrestore(&info->lock, flags); + + timeout=100; + while(timeout-- && !info->irq_occurred) + msleep_interruptible(10); + + spin_lock_irqsave(&info->lock,flags); + reset_port(info); + spin_unlock_irqrestore(&info->lock,flags); + + info->params.data_rate = speed; + info->tty = oldtty; + + info->init_error = info->irq_occurred ? 0 : DiagStatus_IrqFailure; + return info->irq_occurred ? 0 : -ENODEV; +} + +static int loopback_test_rx(struct slgt_info *info) +{ + unsigned char *src, *dest; + int count; + + if (desc_complete(info->rbufs[0])) { + count = desc_count(info->rbufs[0]); + src = info->rbufs[0].buf; + dest = info->tmp_rbuf; + + for( ; count ; count-=2, src+=2) { + /* src=data byte (src+1)=status byte */ + if (!(*(src+1) & (BIT9 + BIT8))) { + *dest = *src; + dest++; + info->tmp_rbuf_count++; + } + } + DBGDATA(info, info->tmp_rbuf, info->tmp_rbuf_count, "rx"); + return 1; + } + return 0; +} + +static int loopback_test(struct slgt_info *info) +{ +#define TESTFRAMESIZE 20 + + unsigned long timeout; + u16 count = TESTFRAMESIZE; + unsigned char buf[TESTFRAMESIZE]; + int rc = -ENODEV; + unsigned long flags; + + struct tty_struct *oldtty = info->tty; + MGSL_PARAMS params; + + memcpy(¶ms, &info->params, sizeof(params)); + + info->params.mode = MGSL_MODE_ASYNC; + info->params.data_rate = 921600; + info->params.loopback = 1; + info->tty = NULL; + + /* build and send transmit frame */ + for (count = 0; count < TESTFRAMESIZE; ++count) + buf[count] = (unsigned char)count; + + info->tmp_rbuf_count = 0; + memset(info->tmp_rbuf, 0, TESTFRAMESIZE); + + /* program hardware for HDLC and enabled receiver */ + spin_lock_irqsave(&info->lock,flags); + async_mode(info); + rx_start(info); + info->tx_count = count; + tx_load(info, buf, count); + tx_start(info); + spin_unlock_irqrestore(&info->lock, flags); + + /* wait for receive complete */ + for (timeout = 100; timeout; --timeout) { + msleep_interruptible(10); + if (loopback_test_rx(info)) { + rc = 0; + break; + } + } + + /* verify received frame length and contents */ + if (!rc && (info->tmp_rbuf_count != count || + memcmp(buf, info->tmp_rbuf, count))) { + rc = -ENODEV; + } + + spin_lock_irqsave(&info->lock,flags); + reset_adapter(info); + spin_unlock_irqrestore(&info->lock,flags); + + memcpy(&info->params, ¶ms, sizeof(info->params)); + info->tty = oldtty; + + info->init_error = rc ? 
DiagStatus_DmaFailure : 0; + return rc; +} + +static int adapter_test(struct slgt_info *info) +{ + DBGINFO(("testing %s\n", info->device_name)); + if ((info->init_error = register_test(info)) < 0) { + printk("register test failure %s addr=%08X\n", + info->device_name, info->phys_reg_addr); + } else if ((info->init_error = irq_test(info)) < 0) { + printk("IRQ test failure %s IRQ=%d\n", + info->device_name, info->irq_level); + } else if ((info->init_error = loopback_test(info)) < 0) { + printk("loopback test failure %s\n", info->device_name); + } + return info->init_error; +} + +/* + * transmit timeout handler + */ +static void tx_timeout(unsigned long context) +{ + struct slgt_info *info = (struct slgt_info*)context; + unsigned long flags; + + DBGINFO(("%s tx_timeout\n", info->device_name)); + if(info->tx_active && info->params.mode == MGSL_MODE_HDLC) { + info->icount.txtimeout++; + } + spin_lock_irqsave(&info->lock,flags); + info->tx_active = 0; + info->tx_count = 0; + spin_unlock_irqrestore(&info->lock,flags); + +#ifdef CONFIG_HDLC + if (info->netcount) + hdlcdev_tx_done(info); + else +#endif + bh_transmit(info); +} + +/* + * receive buffer polling timer + */ +static void rx_timeout(unsigned long context) +{ + struct slgt_info *info = (struct slgt_info*)context; + unsigned long flags; + + DBGINFO(("%s rx_timeout\n", info->device_name)); + spin_lock_irqsave(&info->lock, flags); + info->pending_bh |= BH_RECEIVE; + spin_unlock_irqrestore(&info->lock, flags); + bh_handler(info); +} + diff --git a/include/linux/synclink.h b/include/linux/synclink.h index 763bd290f2..1b7cd8d1a7 100644 --- a/include/linux/synclink.h +++ b/include/linux/synclink.h @@ -1,7 +1,7 @@ /* * SyncLink Multiprotocol Serial Adapter Driver * - * $Id: synclink.h,v 3.6 2002/02/20 21:58:20 paulkf Exp $ + * $Id: synclink.h,v 3.10 2005/11/08 19:50:54 paulkf Exp $ * * Copyright (C) 1998-2000 by Microgate Corporation * @@ -128,10 +128,14 @@ #define MGSL_BUS_TYPE_EISA 2 #define MGSL_BUS_TYPE_PCI 5 +#define MGSL_INTERFACE_MASK 0xf #define MGSL_INTERFACE_DISABLE 0 #define MGSL_INTERFACE_RS232 1 #define MGSL_INTERFACE_V35 2 #define MGSL_INTERFACE_RS422 3 +#define MGSL_INTERFACE_RTS_EN 0x10 +#define MGSL_INTERFACE_LL 0x20 +#define MGSL_INTERFACE_RL 0x40 typedef struct _MGSL_PARAMS { @@ -163,6 +167,9 @@ typedef struct _MGSL_PARAMS #define SYNCLINK_DEVICE_ID 0x0010 #define MGSCC_DEVICE_ID 0x0020 #define SYNCLINK_SCA_DEVICE_ID 0x0030 +#define SYNCLINK_GT_DEVICE_ID 0x0070 +#define SYNCLINK_GT4_DEVICE_ID 0x0080 +#define SYNCLINK_AC_DEVICE_ID 0x0090 #define MGSL_MAX_SERIAL_NUMBER 30 /* -- cgit v1.2.2 From 9ded96f24c3a5fcbef954e88c443385a1af37eb9 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 8 Jan 2006 01:02:07 -0800 Subject: [PATCH] IRQ type flags Some ARM platforms have the ability to program the interrupt controller to detect various interrupt edges and/or levels. For some platforms, this is critical to setup correctly, particularly those which the setting is dependent on the device. Currently, ARM drivers do (eg) the following: err = request_irq(irq, ...); set_irq_type(irq, IRQT_RISING); However, if the interrupt has previously been programmed to be level sensitive (for whatever reason) then this will cause an interrupt storm. Hence, if we combine set_irq_type() with request_irq(), we can then safely set the type prior to unmasking the interrupt. 
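For illustration (placeholder handler and device names, not code from this
patch), the combined form is:

	err = request_irq(irq, my_isr,
			  SA_INTERRUPT | SA_TRIGGER_RISING,
			  "mydev", dev);

setup_irq() then programs the trigger type from the SA_TRIGGER_* bits before
the interrupt is first unmasked, so there is no window in which a stale
level/edge setting can cause an interrupt storm.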
The unfortunate problem is that in order to support this, these flags need to be visible outside of the ARM architecture - drivers such as smc91x need these flags and they're cross-architecture. Finally, the SA_TRIGGER_* flag passed to request_irq() should reflect the property that the device would like. The IRQ controller code should do its best to select the most appropriate supported mode. Signed-off-by: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/irq.c | 14 ++++++++++++-- arch/arm/mach-omap1/serial.c | 3 +-- arch/arm/mach-pxa/corgi.c | 7 +++---- arch/arm/mach-pxa/poodle.c | 7 +++---- arch/arm/mach-pxa/spitz.c | 7 +++---- arch/arm/mach-s3c2410/usb-simtec.c | 6 +++--- drivers/i2c/chips/tps65010.c | 11 ++++++----- drivers/input/keyboard/corgikbd.c | 6 ++---- drivers/input/keyboard/spitzkbd.c | 27 ++++++++++++++------------- drivers/mfd/ucb1x00-core.c | 5 ++--- drivers/net/smc91x.c | 5 +---- drivers/net/smc91x.h | 18 +++++++++--------- include/asm-arm/irq.h | 12 ++++++++---- include/linux/signal.h | 13 +++++++++++++ 14 files changed, 80 insertions(+), 61 deletions(-) diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c index 869c466e62..b5645c4462 100644 --- a/arch/arm/kernel/irq.c +++ b/arch/arm/kernel/irq.c @@ -684,8 +684,12 @@ int setup_irq(unsigned int irq, struct irqaction *new) spin_lock_irqsave(&irq_controller_lock, flags); p = &desc->action; if ((old = *p) != NULL) { - /* Can't share interrupts unless both agree to */ - if (!(old->flags & new->flags & SA_SHIRQ)) { + /* + * Can't share interrupts unless both agree to and are + * the same type. + */ + if (!(old->flags & new->flags & SA_SHIRQ) || + (~old->flags & new->flags) & SA_TRIGGER_MASK) { spin_unlock_irqrestore(&irq_controller_lock, flags); return -EBUSY; } @@ -705,6 +709,12 @@ int setup_irq(unsigned int irq, struct irqaction *new) desc->running = 0; desc->pending = 0; desc->disable_depth = 1; + + if (new->flags & SA_TRIGGER_MASK) { + unsigned int type = new->flags & SA_TRIGGER_MASK; + desc->chip->set_type(irq, type); + } + if (!desc->noautoenable) { desc->disable_depth = 0; desc->chip->unmask(irq); diff --git a/arch/arm/mach-omap1/serial.c b/arch/arm/mach-omap1/serial.c index fcfb81d13c..7a68f098a0 100644 --- a/arch/arm/mach-omap1/serial.c +++ b/arch/arm/mach-omap1/serial.c @@ -252,9 +252,8 @@ static void __init omap_serial_set_port_wakeup(int gpio_nr) return; } omap_set_gpio_direction(gpio_nr, 1); - set_irq_type(OMAP_GPIO_IRQ(gpio_nr), IRQT_RISING); ret = request_irq(OMAP_GPIO_IRQ(gpio_nr), &omap_serial_wake_interrupt, - 0, "serial wakeup", NULL); + SA_TRIGGER_RISING, "serial wakeup", NULL); if (ret) { omap_free_gpio(gpio_nr); printk(KERN_ERR "No interrupt for UART wake GPIO: %i\n", diff --git a/arch/arm/mach-pxa/corgi.c b/arch/arm/mach-pxa/corgi.c index 100fb31b51..5a7b873f29 100644 --- a/arch/arm/mach-pxa/corgi.c +++ b/arch/arm/mach-pxa/corgi.c @@ -213,15 +213,14 @@ static int corgi_mci_init(struct device *dev, irqreturn_t (*corgi_detect_int)(in corgi_mci_platform_data.detect_delay = msecs_to_jiffies(250); - err = request_irq(CORGI_IRQ_GPIO_nSD_DETECT, corgi_detect_int, SA_INTERRUPT, - "MMC card detect", data); + err = request_irq(CORGI_IRQ_GPIO_nSD_DETECT, corgi_detect_int, + SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING, + "MMC card detect", data); if (err) { printk(KERN_ERR "corgi_mci_init: MMC/SD: can't request MMC card detect IRQ\n"); return -1; } - set_irq_type(CORGI_IRQ_GPIO_nSD_DETECT, IRQT_BOTHEDGE); - return 0; } diff --git a/arch/arm/mach-pxa/poodle.c 
b/arch/arm/mach-pxa/poodle.c index eef3de26ad..663c950059 100644 --- a/arch/arm/mach-pxa/poodle.c +++ b/arch/arm/mach-pxa/poodle.c @@ -146,15 +146,14 @@ static int poodle_mci_init(struct device *dev, irqreturn_t (*poodle_detect_int)( poodle_mci_platform_data.detect_delay = msecs_to_jiffies(250); - err = request_irq(POODLE_IRQ_GPIO_nSD_DETECT, poodle_detect_int, SA_INTERRUPT, - "MMC card detect", data); + err = request_irq(POODLE_IRQ_GPIO_nSD_DETECT, poodle_detect_int, + SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING, + "MMC card detect", data); if (err) { printk(KERN_ERR "poodle_mci_init: MMC/SD: can't request MMC card detect IRQ\n"); return -1; } - set_irq_type(POODLE_IRQ_GPIO_nSD_DETECT, IRQT_BOTHEDGE); - return 0; } diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index f2007db0cd..a9eacc0655 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -296,15 +296,14 @@ static int spitz_mci_init(struct device *dev, irqreturn_t (*spitz_detect_int)(in spitz_mci_platform_data.detect_delay = msecs_to_jiffies(250); - err = request_irq(SPITZ_IRQ_GPIO_nSD_DETECT, spitz_detect_int, SA_INTERRUPT, - "MMC card detect", data); + err = request_irq(SPITZ_IRQ_GPIO_nSD_DETECT, spitz_detect_int, + SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING, + "MMC card detect", data); if (err) { printk(KERN_ERR "spitz_mci_init: MMC/SD: can't request MMC card detect IRQ\n"); return -1; } - set_irq_type(SPITZ_IRQ_GPIO_nSD_DETECT, IRQT_BOTHEDGE); - return 0; } diff --git a/arch/arm/mach-s3c2410/usb-simtec.c b/arch/arm/mach-s3c2410/usb-simtec.c index 5098b50158..495f8c6ffc 100644 --- a/arch/arm/mach-s3c2410/usb-simtec.c +++ b/arch/arm/mach-s3c2410/usb-simtec.c @@ -84,13 +84,13 @@ static void usb_simtec_enableoc(struct s3c2410_hcd_info *info, int on) int ret; if (on) { - ret = request_irq(IRQ_USBOC, usb_simtec_ocirq, SA_INTERRUPT, + ret = request_irq(IRQ_USBOC, usb_simtec_ocirq, + SA_INTERRUPT | SA_TRIGGER_RISING | + SA_TRIGGER_FALLING, "USB Over-current", info); if (ret != 0) { printk(KERN_ERR "failed to request usb oc irq\n"); } - - set_irq_type(IRQ_USBOC, IRQT_BOTHEDGE); } else { free_irq(IRQ_USBOC, info); } diff --git a/drivers/i2c/chips/tps65010.c b/drivers/i2c/chips/tps65010.c index e70b3db69e..1af3dfbb80 100644 --- a/drivers/i2c/chips/tps65010.c +++ b/drivers/i2c/chips/tps65010.c @@ -494,6 +494,7 @@ tps65010_probe(struct i2c_adapter *bus, int address, int kind) { struct tps65010 *tps; int status; + unsigned long irqflags; if (the_tps) { dev_dbg(&bus->dev, "only one %s for now\n", DRIVER_NAME); @@ -520,13 +521,14 @@ tps65010_probe(struct i2c_adapter *bus, int address, int kind) } #ifdef CONFIG_ARM + irqflags = SA_SAMPLE_RANDOM | SA_TRIGGER_LOW; if (machine_is_omap_h2()) { tps->model = TPS65010; omap_cfg_reg(W4_GPIO58); tps->irq = OMAP_GPIO_IRQ(58); omap_request_gpio(58); omap_set_gpio_direction(58, 1); - set_irq_type(tps->irq, IRQT_FALLING); + irqflags |= SA_TRIGGER_FALLING; } if (machine_is_omap_osk()) { tps->model = TPS65010; @@ -534,7 +536,7 @@ tps65010_probe(struct i2c_adapter *bus, int address, int kind) tps->irq = OMAP_GPIO_IRQ(OMAP_MPUIO(1)); omap_request_gpio(OMAP_MPUIO(1)); omap_set_gpio_direction(OMAP_MPUIO(1), 1); - set_irq_type(tps->irq, IRQT_FALLING); + irqflags |= SA_TRIGGER_FALLING; } if (machine_is_omap_h3()) { tps->model = TPS65013; @@ -542,13 +544,12 @@ tps65010_probe(struct i2c_adapter *bus, int address, int kind) // FIXME set up this board's IRQ ... 
} #else -#define set_irq_type(num,trigger) do{}while(0) + irqflags = SA_SAMPLE_RANDOM; #endif if (tps->irq > 0) { - set_irq_type(tps->irq, IRQT_LOW); status = request_irq(tps->irq, tps65010_irq, - SA_SAMPLE_RANDOM, DRIVER_NAME, tps); + irqflags, DRIVER_NAME, tps); if (status < 0) { dev_dbg(&tps->client.dev, "can't get IRQ %d, err %d\n", tps->irq, status); diff --git a/drivers/input/keyboard/corgikbd.c b/drivers/input/keyboard/corgikbd.c index 64672d4912..e301ee4ca2 100644 --- a/drivers/input/keyboard/corgikbd.c +++ b/drivers/input/keyboard/corgikbd.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -343,10 +342,9 @@ static int __init corgikbd_probe(struct platform_device *pdev) for (i = 0; i < CORGI_KEY_SENSE_NUM; i++) { pxa_gpio_mode(CORGI_GPIO_KEY_SENSE(i) | GPIO_IN); if (request_irq(CORGI_IRQ_GPIO_KEY_SENSE(i), corgikbd_interrupt, - SA_INTERRUPT, "corgikbd", corgikbd)) + SA_INTERRUPT | SA_TRIGGER_RISING, + "corgikbd", corgikbd)) printk(KERN_WARNING "corgikbd: Can't get IRQ: %d!\n", i); - else - set_irq_type(CORGI_IRQ_GPIO_KEY_SENSE(i),IRQT_RISING); } /* Set Strobe lines as outputs - set high */ diff --git a/drivers/input/keyboard/spitzkbd.c b/drivers/input/keyboard/spitzkbd.c index 6a15fe3bc5..83999d5831 100644 --- a/drivers/input/keyboard/spitzkbd.c +++ b/drivers/input/keyboard/spitzkbd.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -407,10 +406,9 @@ static int __init spitzkbd_probe(struct platform_device *dev) for (i = 0; i < SPITZ_KEY_SENSE_NUM; i++) { pxa_gpio_mode(spitz_senses[i] | GPIO_IN); if (request_irq(IRQ_GPIO(spitz_senses[i]), spitzkbd_interrupt, - SA_INTERRUPT, "Spitzkbd Sense", spitzkbd)) + SA_INTERRUPT|SA_TRIGGER_RISING, + "Spitzkbd Sense", spitzkbd)) printk(KERN_WARNING "spitzkbd: Can't get Sense IRQ: %d!\n", i); - else - set_irq_type(IRQ_GPIO(spitz_senses[i]),IRQT_RISING); } /* Set Strobe lines as outputs - set high */ @@ -422,15 +420,18 @@ static int __init spitzkbd_probe(struct platform_device *dev) pxa_gpio_mode(SPITZ_GPIO_SWA | GPIO_IN); pxa_gpio_mode(SPITZ_GPIO_SWB | GPIO_IN); - request_irq(SPITZ_IRQ_GPIO_SYNC, spitzkbd_interrupt, SA_INTERRUPT, "Spitzkbd Sync", spitzkbd); - request_irq(SPITZ_IRQ_GPIO_ON_KEY, spitzkbd_interrupt, SA_INTERRUPT, "Spitzkbd PwrOn", spitzkbd); - request_irq(SPITZ_IRQ_GPIO_SWA, spitzkbd_hinge_isr, SA_INTERRUPT, "Spitzkbd SWA", spitzkbd); - request_irq(SPITZ_IRQ_GPIO_SWB, spitzkbd_hinge_isr, SA_INTERRUPT, "Spitzkbd SWB", spitzkbd); - - set_irq_type(SPITZ_IRQ_GPIO_SYNC, IRQT_BOTHEDGE); - set_irq_type(SPITZ_IRQ_GPIO_ON_KEY, IRQT_BOTHEDGE); - set_irq_type(SPITZ_IRQ_GPIO_SWA, IRQT_BOTHEDGE); - set_irq_type(SPITZ_IRQ_GPIO_SWB, IRQT_BOTHEDGE); + request_irq(SPITZ_IRQ_GPIO_SYNC, spitzkbd_interrupt, + SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING, + "Spitzkbd Sync", spitzkbd); + request_irq(SPITZ_IRQ_GPIO_ON_KEY, spitzkbd_interrupt, + SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING, + "Spitzkbd PwrOn", spitzkbd); + request_irq(SPITZ_IRQ_GPIO_SWA, spitzkbd_hinge_isr, + SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING, + "Spitzkbd SWA", spitzkbd); + request_irq(SPITZ_IRQ_GPIO_SWB, spitzkbd_hinge_isr, + SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING, + "Spitzkbd SWB", spitzkbd); printk(KERN_INFO "input: Spitz Keyboard Registered\n"); diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c index e335d54c46..b42e0fbab5 100644 --- a/drivers/mfd/ucb1x00-core.c +++ b/drivers/mfd/ucb1x00-core.c @@ -27,7 +27,6 @@ #include #include -#include #include "ucb1x00.h" 
@@ -507,14 +506,14 @@ static int ucb1x00_probe(struct mcp *mcp) goto err_free; } - ret = request_irq(ucb->irq, ucb1x00_irq, 0, "UCB1x00", ucb); + ret = request_irq(ucb->irq, ucb1x00_irq, SA_TRIGGER_RISING, + "UCB1x00", ucb); if (ret) { printk(KERN_ERR "ucb1x00: unable to grab irq%d: %d\n", ucb->irq, ret); goto err_free; } - set_irq_type(ucb->irq, IRQT_RISING); mcp_set_drvdata(mcp, ucb); ret = class_device_register(&ucb->cdev); diff --git a/drivers/net/smc91x.c b/drivers/net/smc91x.c index 28bf2e69eb..7ec08127c9 100644 --- a/drivers/net/smc91x.c +++ b/drivers/net/smc91x.c @@ -88,7 +88,6 @@ static const char version[] = #include #include -#include #include "smc91x.h" @@ -2007,12 +2006,10 @@ static int __init smc_probe(struct net_device *dev, void __iomem *ioaddr) } /* Grab the IRQ */ - retval = request_irq(dev->irq, &smc_interrupt, 0, dev->name, dev); + retval = request_irq(dev->irq, &smc_interrupt, SMC_IRQ_FLAGS, dev->name, dev); if (retval) goto err_out; - set_irq_type(dev->irq, SMC_IRQ_TRIGGER_TYPE); - #ifdef SMC_USE_PXA_DMA { int dma = pxa_request_dma(dev->name, DMA_PRIO_LOW, diff --git a/drivers/net/smc91x.h b/drivers/net/smc91x.h index 5c2824be4e..e0efd1964e 100644 --- a/drivers/net/smc91x.h +++ b/drivers/net/smc91x.h @@ -90,7 +90,7 @@ __l--; \ } \ } while (0) -#define set_irq_type(irq, type) +#define SMC_IRQ_FLAGS (0) #elif defined(CONFIG_SA1100_PLEB) /* We can only do 16-bit reads and writes in the static memory space. */ @@ -109,7 +109,7 @@ #define SMC_outw(v, a, r) writew(v, (a) + (r)) #define SMC_outsw(a, r, p, l) writesw((a) + (r), p, l) -#define set_irq_type(irq, type) do {} while (0) +#define SMC_IRQ_FLAGS (0) #elif defined(CONFIG_SA1100_ASSABET) @@ -185,11 +185,11 @@ SMC_outw(u16 val, void __iomem *ioaddr, int reg) #include #include -#define SMC_IRQ_TRIGGER_TYPE (( \ +#define SMC_IRQ_FLAGS (( \ machine_is_omap_h2() \ || machine_is_omap_h3() \ || (machine_is_omap_innovator() && !cpu_is_omap1510()) \ - ) ? IRQT_FALLING : IRQT_RISING) + ) ? 
SA_TRIGGER_FALLING : SA_TRIGGER_RISING) #elif defined(CONFIG_SH_SH4202_MICRODEV) @@ -209,7 +209,7 @@ SMC_outw(u16 val, void __iomem *ioaddr, int reg) #define SMC_insw(a, r, p, l) insw((a) + (r) - 0xa0000000, p, l) #define SMC_outsw(a, r, p, l) outsw((a) + (r) - 0xa0000000, p, l) -#define set_irq_type(irq, type) do {} while(0) +#define SMC_IRQ_FLAGS (0) #elif defined(CONFIG_ISA) @@ -237,7 +237,7 @@ SMC_outw(u16 val, void __iomem *ioaddr, int reg) #define SMC_insw(a, r, p, l) insw(((u32)a) + (r), p, l) #define SMC_outsw(a, r, p, l) outsw(((u32)a) + (r), p, l) -#define set_irq_type(irq, type) do {} while(0) +#define SMC_IRQ_FLAGS (0) #define RPC_LSA_DEFAULT RPC_LED_TX_RX #define RPC_LSB_DEFAULT RPC_LED_100_10 @@ -319,7 +319,7 @@ static inline void SMC_outsw (unsigned long a, int r, unsigned char* p, int l) au_writew(*_p++ , _a); \ } while(0) -#define set_irq_type(irq, type) do {} while (0) +#define SMC_IRQ_FLAGS (0) #else @@ -342,8 +342,8 @@ static inline void SMC_outsw (unsigned long a, int r, unsigned char* p, int l) #endif -#ifndef SMC_IRQ_TRIGGER_TYPE -#define SMC_IRQ_TRIGGER_TYPE IRQT_RISING +#ifndef SMC_IRQ_FLAGS +#define SMC_IRQ_FLAGS SA_TRIGGER_RISING #endif #ifdef SMC_USE_PXA_DMA diff --git a/include/asm-arm/irq.h b/include/asm-arm/irq.h index 59975ee43c..7772432d3f 100644 --- a/include/asm-arm/irq.h +++ b/include/asm-arm/irq.h @@ -25,10 +25,14 @@ extern void disable_irq_nosync(unsigned int); extern void disable_irq(unsigned int); extern void enable_irq(unsigned int); -#define __IRQT_FALEDGE (1 << 0) -#define __IRQT_RISEDGE (1 << 1) -#define __IRQT_LOWLVL (1 << 2) -#define __IRQT_HIGHLVL (1 << 3) +/* + * These correspond with the SA_TRIGGER_* defines, and therefore the + * IRQRESOURCE_IRQ_* defines. + */ +#define __IRQT_RISEDGE (1 << 0) +#define __IRQT_FALEDGE (1 << 1) +#define __IRQT_HIGHLVL (1 << 2) +#define __IRQT_LOWLVL (1 << 3) #define IRQT_NOEDGE (0) #define IRQT_RISING (__IRQT_RISEDGE) diff --git a/include/linux/signal.h b/include/linux/signal.h index 5dd5f02c5c..ea9eff16c4 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -18,6 +18,19 @@ #define SA_PROBE SA_ONESHOT #define SA_SAMPLE_RANDOM SA_RESTART #define SA_SHIRQ 0x04000000 +/* + * As above, these correspond to the IORESOURCE_IRQ_* defines in + * linux/ioport.h to select the interrupt line behaviour. When + * requesting an interrupt without specifying a SA_TRIGGER, the + * setting should be assumed to be "as already configured", which + * may be as per machine or firmware initialisation. + */ +#define SA_TRIGGER_LOW 0x00000008 +#define SA_TRIGGER_HIGH 0x00000004 +#define SA_TRIGGER_FALLING 0x00000002 +#define SA_TRIGGER_RISING 0x00000001 +#define SA_TRIGGER_MASK (SA_TRIGGER_HIGH|SA_TRIGGER_LOW|\ + SA_TRIGGER_RISING|SA_TRIGGER_FALLING) /* * Real Time signals may be queued. -- cgit v1.2.2 From a6bf6b211cdb92c315c24719a522d8b6f3998210 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:08 -0800 Subject: [PATCH] fat: move fat_clusters_flush() to write_super() It is overkill to update the FS_INFO whenever modifying prev_free/free_clusters, because those are just a hint. So, this patch uses ->write_super() for updating FS_INFO instead. 
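What makes this safe is the VFS superblock write-back machinery: every place that changes the free-cluster hint only marks the superblock dirty, and the periodic writeback pass walks the dirty superblocks and invokes ->write_super(). In outline, this is roughly what fs/super.c's sync_supers() does, with the locking and reference counting stripped out (a simplified sketch, not the verbatim code):

	static void sync_supers_sketch(void)		/* illustrative name */
	{
		struct super_block *sb;

		list_for_each_entry(sb, &super_blocks, s_list) {
			if (sb->s_dirt && sb->s_op->write_super)
				sb->s_op->write_super(sb);	/* fat_write_super() for FAT */
		}
	}

So setting sb->s_dirt = 1 next to every prev_free/free_clusters update is enough; the FS_INFO sector is then written back lazily instead of on every allocation or free.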
Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/fatent.c | 8 ++++++-- fs/fat/inode.c | 10 ++++++++-- fs/fat/misc.c | 2 -- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 4164cd54c4..20a22875f0 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -476,6 +476,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) sbi->prev_free = entry; if (sbi->free_clusters != -1) sbi->free_clusters--; + sb->s_dirt = 1; cluster[idx_clus] = entry; idx_clus++; @@ -496,6 +497,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) /* Couldn't allocate the free entries */ sbi->free_clusters = 0; + sb->s_dirt = 1; err = -ENOSPC; out: @@ -509,7 +511,6 @@ out: } for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); - fat_clusters_flush(sb); if (err && idx_clus) fat_free_clusters(inode, cluster[0]); @@ -542,8 +543,10 @@ int fat_free_clusters(struct inode *inode, int cluster) } ops->ent_put(&fatent, FAT_ENT_FREE); - if (sbi->free_clusters != -1) + if (sbi->free_clusters != -1) { sbi->free_clusters++; + sb->s_dirt = 1; + } if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) { if (sb->s_flags & MS_SYNCHRONOUS) { @@ -605,6 +608,7 @@ int fat_count_free_clusters(struct super_block *sb) } while (fat_ent_next(sbi, &fatent)); } sbi->free_clusters = free; + sb->s_dirt = 1; fatent_brelse(&fatent); out: unlock_fat(sbi); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a0f9b9fe13..897312616b 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -374,12 +374,17 @@ static void fat_clear_inode(struct inode *inode) unlock_kernel(); } -static void fat_put_super(struct super_block *sb) +static void fat_write_super(struct super_block *sb) { - struct msdos_sb_info *sbi = MSDOS_SB(sb); + sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) fat_clusters_flush(sb); +} + +static void fat_put_super(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); if (sbi->nls_disk) { unload_nls(sbi->nls_disk); @@ -546,6 +551,7 @@ static struct super_operations fat_sops = { .write_inode = fat_write_inode, .delete_inode = fat_delete_inode, .put_super = fat_put_super, + .write_super = fat_write_super, .statfs = fat_statfs, .clear_inode = fat_clear_inode, .remount_fs = fat_remount, diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 2a0df2122f..9b592e37e2 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -67,8 +67,6 @@ void fat_clusters_flush(struct super_block *sb) if (sbi->prev_free != -1) fsinfo->next_cluster = cpu_to_le32(sbi->prev_free); mark_buffer_dirty(bh); - if (sb->s_flags & MS_SYNCHRONOUS) - sync_dirty_buffer(bh); } brelse(bh); } -- cgit v1.2.2 From 83b7c996dc859c7b53f94d46ee5c5929cc0399e2 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:09 -0800 Subject: [PATCH] fat: use sb_find_get_block() instead of sb_getblk() We don't need to allocate buffer for checking the buffer is uptodate. This use sb_find_get_block() instead, and if it returns NULL it's not uptodate. 
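The distinction between the two helpers is the whole point: sb_getblk() always returns a buffer_head, allocating one if the block is not in the cache, so it cannot serve as a cheap "is this block already cached?" probe, while sb_find_get_block() only looks up an existing buffer and returns NULL when there is none. The readahead test therefore reduces to the following shape (condensed from the hunk below):

	struct buffer_head *bh;

	bh = sb_find_get_block(sb, phys);	/* lookup only, no allocation */
	if (bh == NULL || !buffer_uptodate(bh)) {
		/* block not cached (or stale): start readahead for the cluster */
		for (sec = 0; sec < sbi->sec_per_clus; sec++)
			sb_breadahead(sb, phys + sec);
	}
	if (bh)
		brelse(bh);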
Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index ba824964b9..b2a26cd226 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -45,8 +45,8 @@ static inline void fat_dir_readahead(struct inode *dir, sector_t iblock, if ((sbi->fat_bits != 32) && (dir->i_ino == MSDOS_ROOT_INO)) return; - bh = sb_getblk(sb, phys); - if (bh && !buffer_uptodate(bh)) { + bh = sb_find_get_block(sb, phys); + if (bh == NULL || !buffer_uptodate(bh)) { for (sec = 0; sec < sbi->sec_per_clus; sec++) sb_breadahead(sb, phys + sec); } -- cgit v1.2.2 From a5425d2927a6a771f9ae8767b6bfb3c09225bcdd Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:10 -0800 Subject: [PATCH] fat: add the read/writepages() Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 897312616b..f502c6b8cb 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -90,9 +91,21 @@ static int fat_writepage(struct page *page, struct writeback_control *wbc) return block_write_full_page(page, fat_get_block, wbc); } +static int fat_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, fat_get_block); +} + static int fat_readpage(struct file *file, struct page *page) { - return block_read_full_page(page, fat_get_block); + return mpage_readpage(page, fat_get_block); +} + +static int fat_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, fat_get_block); } static int fat_prepare_write(struct file *file, struct page *page, @@ -122,7 +135,9 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block) static struct address_space_operations fat_aops = { .readpage = fat_readpage, + .readpages = fat_readpages, .writepage = fat_writepage, + .writepages = fat_writepages, .sync_page = block_sync_page, .prepare_write = fat_prepare_write, .commit_write = fat_commit_write, -- cgit v1.2.2 From 7c709d00d614d0f2b6a80895b2a1aedbe04e8478 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:10 -0800 Subject: [PATCH] fat: s/EXPORT_SYMBOL/EXPORT_SYMBOL_GPL/ All EXPORT_SYMBOL of fatfs is only for vfat/msdos. _GPL would be proper. 
Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 14 +++++++------- fs/fat/fatent.c | 2 +- fs/fat/file.c | 2 +- fs/fat/inode.c | 10 +++++----- fs/fat/misc.c | 6 +++--- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index b2a26cd226..4ce77475ae 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -418,7 +418,7 @@ EODir: return err; } -EXPORT_SYMBOL(fat_search_long); +EXPORT_SYMBOL_GPL(fat_search_long); struct fat_ioctl_filldir_callback { struct dirent __user *dirent; @@ -780,7 +780,7 @@ int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, return -ENOENT; } -EXPORT_SYMBOL(fat_get_dotdot_entry); +EXPORT_SYMBOL_GPL(fat_get_dotdot_entry); /* See if directory is empty */ int fat_dir_empty(struct inode *dir) @@ -803,7 +803,7 @@ int fat_dir_empty(struct inode *dir) return result; } -EXPORT_SYMBOL(fat_dir_empty); +EXPORT_SYMBOL_GPL(fat_dir_empty); /* * fat_subdirs counts the number of sub-directories of dir. It can be run @@ -849,7 +849,7 @@ int fat_scan(struct inode *dir, const unsigned char *name, return -ENOENT; } -EXPORT_SYMBOL(fat_scan); +EXPORT_SYMBOL_GPL(fat_scan); static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) { @@ -936,7 +936,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) return 0; } -EXPORT_SYMBOL(fat_remove_entries); +EXPORT_SYMBOL_GPL(fat_remove_entries); static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, struct buffer_head **bhs, int nr_bhs) @@ -1048,7 +1048,7 @@ error: return err; } -EXPORT_SYMBOL(fat_alloc_new_dir); +EXPORT_SYMBOL_GPL(fat_alloc_new_dir); static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, int *nr_cluster, struct msdos_dir_entry **de, @@ -1264,4 +1264,4 @@ error_remove: return err; } -EXPORT_SYMBOL(fat_add_entries); +EXPORT_SYMBOL_GPL(fat_add_entries); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 20a22875f0..a1a9e04512 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -581,7 +581,7 @@ error: return err; } -EXPORT_SYMBOL(fat_free_clusters); +EXPORT_SYMBOL_GPL(fat_free_clusters); int fat_count_free_clusters(struct super_block *sb) { diff --git a/fs/fat/file.c b/fs/fat/file.c index 7134403d5b..15229fe569 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -173,7 +173,7 @@ out: return error; } -EXPORT_SYMBOL(fat_notify_change); +EXPORT_SYMBOL_GPL(fat_notify_change); /* Free all clusters after the skip'th cluster. 
*/ static int fat_free(struct inode *inode, int skip) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index f502c6b8cb..932c8d6d1f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -197,7 +197,7 @@ void fat_attach(struct inode *inode, loff_t i_pos) spin_unlock(&sbi->inode_hash_lock); } -EXPORT_SYMBOL(fat_attach); +EXPORT_SYMBOL_GPL(fat_attach); void fat_detach(struct inode *inode) { @@ -208,7 +208,7 @@ void fat_detach(struct inode *inode) spin_unlock(&sbi->inode_hash_lock); } -EXPORT_SYMBOL(fat_detach); +EXPORT_SYMBOL_GPL(fat_detach); struct inode *fat_iget(struct super_block *sb, loff_t i_pos) { @@ -362,7 +362,7 @@ out: return inode; } -EXPORT_SYMBOL(fat_build_inode); +EXPORT_SYMBOL_GPL(fat_build_inode); static void fat_delete_inode(struct inode *inode) { @@ -557,7 +557,7 @@ int fat_sync_inode(struct inode *inode) return fat_write_inode(inode, 1); } -EXPORT_SYMBOL(fat_sync_inode); +EXPORT_SYMBOL_GPL(fat_sync_inode); static int fat_show_options(struct seq_file *m, struct vfsmount *mnt); static struct super_operations fat_sops = { @@ -1368,7 +1368,7 @@ out_fail: return error; } -EXPORT_SYMBOL(fat_fill_super); +EXPORT_SYMBOL_GPL(fat_fill_super); int __init fat_cache_init(void); void fat_cache_destroy(void); diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 9b592e37e2..32fb0a3f1d 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -33,7 +33,7 @@ void fat_fs_panic(struct super_block *s, const char *fmt, ...) } } -EXPORT_SYMBOL(fat_fs_panic); +EXPORT_SYMBOL_GPL(fat_fs_panic); /* Flushes the number of free clusters on FAT32 */ /* XXX: Need to write one per FSINFO block. Currently only writes 1 */ @@ -192,7 +192,7 @@ void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date) *date = cpu_to_le16(nl_day-day_n[month-1]+1+(month << 5)+(year << 9)); } -EXPORT_SYMBOL(fat_date_unix2dos); +EXPORT_SYMBOL_GPL(fat_date_unix2dos); int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) { @@ -220,4 +220,4 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) return err; } -EXPORT_SYMBOL(fat_sync_bhs); +EXPORT_SYMBOL_GPL(fat_sync_bhs); -- cgit v1.2.2 From e5174baaea7585760f02eef23b225847d209a8db Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:11 -0800 Subject: [PATCH] fat: support ->direct_IO() This patch add to support of ->direct_IO() for mostly read. The user of this seems to want to use for streaming read. So, current direct I/O has limitation, it can only overwrite. (For write operation, mainly we need to handle the hole etc..) 
Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/cache.c | 14 +++++++-- fs/fat/dir.c | 6 ++-- fs/fat/inode.c | 82 ++++++++++++++++++++++++++++++++++++++++++------ include/linux/msdos_fs.h | 3 +- 4 files changed, 89 insertions(+), 16 deletions(-) diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 77c24fcf71..1acc941245 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -295,7 +295,8 @@ static int fat_bmap_cluster(struct inode *inode, int cluster) return dclus; } -int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys) +int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); @@ -303,9 +304,12 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys) int cluster, offset; *phys = 0; + *mapped_blocks = 0; if ((sbi->fat_bits != 32) && (inode->i_ino == MSDOS_ROOT_INO)) { - if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) + if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) { *phys = sector + sbi->dir_start; + *mapped_blocks = 1; + } return 0; } last_block = (MSDOS_I(inode)->mmu_private + (sb->s_blocksize - 1)) @@ -318,7 +322,11 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys) cluster = fat_bmap_cluster(inode, cluster); if (cluster < 0) return cluster; - else if (cluster) + else if (cluster) { *phys = fat_clus_to_blknr(sbi, cluster) + offset; + *mapped_blocks = sbi->sec_per_clus - offset; + if (*mapped_blocks > last_block - sector) + *mapped_blocks = last_block - sector; + } return 0; } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 4ce77475ae..eef1b81aa2 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -68,8 +68,8 @@ static int fat__get_entry(struct inode *dir, loff_t *pos, { struct super_block *sb = dir->i_sb; sector_t phys, iblock; - int offset; - int err; + unsigned long mapped_blocks; + int err, offset; next: if (*bh) @@ -77,7 +77,7 @@ next: *bh = NULL; iblock = *pos >> sb->s_blocksize_bits; - err = fat_bmap(dir, iblock, &phys); + err = fat_bmap(dir, iblock, &phys, &mapped_blocks); if (err || !phys) return -1; /* beyond EOF or error */ diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 932c8d6d1f..e7f4aa7fc6 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #ifndef CONFIG_FAT_DEFAULT_IOCHARSET @@ -49,43 +50,77 @@ static int fat_add_cluster(struct inode *inode) return err; } -static int fat_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int __fat_get_blocks(struct inode *inode, sector_t iblock, + unsigned long *max_blocks, + struct buffer_head *bh_result, int create) { struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); sector_t phys; - int err; + unsigned long mapped_blocks; + int err, offset; - err = fat_bmap(inode, iblock, &phys); + err = fat_bmap(inode, iblock, &phys, &mapped_blocks); if (err) return err; if (phys) { map_bh(bh_result, sb, phys); + *max_blocks = min(mapped_blocks, *max_blocks); return 0; } if (!create) return 0; + if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) { fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)", MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); return -EIO; } - if (!((unsigned long)iblock & (MSDOS_SB(sb)->sec_per_clus - 1))) { + + offset = (unsigned long)iblock & (sbi->sec_per_clus - 1); + if (!offset) { + /* TODO: multiple cluster allocation 
would be desirable. */ err = fat_add_cluster(inode); if (err) return err; } - MSDOS_I(inode)->mmu_private += sb->s_blocksize; - err = fat_bmap(inode, iblock, &phys); + /* available blocks on this cluster */ + mapped_blocks = sbi->sec_per_clus - offset; + + *max_blocks = min(mapped_blocks, *max_blocks); + MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; + + err = fat_bmap(inode, iblock, &phys, &mapped_blocks); if (err) return err; - if (!phys) - BUG(); + BUG_ON(!phys); + BUG_ON(*max_blocks != mapped_blocks); set_buffer_new(bh_result); map_bh(bh_result, sb, phys); return 0; } +static int fat_get_blocks(struct inode *inode, sector_t iblock, + unsigned long max_blocks, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + int err; + + err = __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create); + if (err) + return err; + bh_result->b_size = max_blocks << sb->s_blocksize_bits; + return 0; +} + +static int fat_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + unsigned long max_blocks = 1; + return __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create); +} + static int fat_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, fat_get_block, wbc); @@ -128,6 +163,34 @@ static int fat_commit_write(struct file *file, struct page *page, return err; } +static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + + if (rw == WRITE) { + /* + * FIXME: blockdev_direct_IO() doesn't use ->prepare_write(), + * so we need to update the ->mmu_private to block boundary. + * + * But we must fill the remaining area or hole by nul for + * updating ->mmu_private. + */ + loff_t size = offset + iov_length(iov, nr_segs); + if (MSDOS_I(inode)->mmu_private < size) + return -EINVAL; + } + + /* + * FAT need to use the DIO_LOCKING for avoiding the race + * condition of fat_get_block() and ->truncate(). + */ + return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, fat_get_blocks, NULL); +} + static sector_t _fat_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, fat_get_block); @@ -141,6 +204,7 @@ static struct address_space_operations fat_aops = { .sync_page = block_sync_page, .prepare_write = fat_prepare_write, .commit_write = fat_commit_write, + .direct_IO = fat_direct_IO, .bmap = _fat_bmap }; diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h index 941da5c016..e933e2a355 100644 --- a/include/linux/msdos_fs.h +++ b/include/linux/msdos_fs.h @@ -329,7 +329,8 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len) extern void fat_cache_inval_inode(struct inode *inode); extern int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus); -extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys); +extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks); /* fat/dir.c */ extern struct file_operations fat_dir_operations; -- cgit v1.2.2 From 268fc16e343b4f8e249468747db2e658da46a814 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:12 -0800 Subject: [PATCH] export/change sync_page_range/_nolock() This exports/changes the sync_page_range/_nolock(). 
The fatfs needs sync_page_range/_nolock() for expanding truncate, and changes "size_t count" to "loff_t count". Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 4 +++- mm/filemap.c | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b096159086..beaef5c7a0 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -103,7 +103,9 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping); int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); int sync_page_range(struct inode *inode, struct address_space *mapping, - loff_t pos, size_t count); + loff_t pos, loff_t count); +int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, + loff_t pos, loff_t count); /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl diff --git a/mm/filemap.c b/mm/filemap.c index 4ef24a3976..8fdf365080 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -280,7 +280,7 @@ static int wait_on_page_writeback_range(struct address_space *mapping, * it is otherwise livelockable. */ int sync_page_range(struct inode *inode, struct address_space *mapping, - loff_t pos, size_t count) + loff_t pos, loff_t count) { pgoff_t start = pos >> PAGE_CACHE_SHIFT; pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; @@ -305,9 +305,8 @@ EXPORT_SYMBOL(sync_page_range); * as it forces O_SYNC writers to different parts of the same file * to be serialised right until io completion. */ -static int sync_page_range_nolock(struct inode *inode, - struct address_space *mapping, - loff_t pos, size_t count) +int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, + loff_t pos, loff_t count) { pgoff_t start = pos >> PAGE_CACHE_SHIFT; pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; @@ -322,6 +321,7 @@ static int sync_page_range_nolock(struct inode *inode, ret = wait_on_page_writeback_range(mapping, start, end); return ret; } +EXPORT_SYMBOL(sync_page_range_nolock); /** * filemap_fdatawait - walk the list of under-writeback pages of the given -- cgit v1.2.2 From 05eb0b51fb46430050d5873458612f53e0234f2e Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:13 -0800 Subject: [PATCH] fat: support a truncate() for expanding size (generic_cont_expand) This patch changes generic_cont_expand(), in order to share the code with fatfs. - Use vmtruncate() if ->prepare_write() returns a error. Even if ->prepare_write() returns an error, it may already have added some blocks. So, this truncates blocks outside of ->i_size by vmtruncate(). - Add generic_cont_expand_simple(). The generic_cont_expand_simple() assumes that ->prepare_write() can handle the block boundary. With this, we don't need to care the extra byte. And for expanding a file size by truncate(), fatfs uses the added generic_cont_expand_simple(). 
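The intended calling pattern is from a filesystem's setattr/truncate path: when the requested size is larger than i_size, expand first (generic_cont_expand_simple() zero-fills through prepare/commit_write and, on failure, trims any blocks instantiated beyond i_size), then update the timestamps. A sketch, close to the fat_cont_expand() helper this series adds for FAT (the function name here is illustrative):

	static int expand_on_truncate(struct inode *inode, loff_t new_size)
	{
		int err;

		err = generic_cont_expand_simple(inode, new_size);
		if (err)
			return err;	/* extra blocks already trimmed by vmtruncate() */

		inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
		mark_inode_dirty(inode);
		return 0;
	}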
Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 60 ++++++++++++++++++++++++++++++++++----------- fs/fat/file.c | 31 ++++++++++++++++++++--- include/linux/buffer_head.h | 3 ++- 3 files changed, 76 insertions(+), 18 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 5287be1863..55023231e4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2160,11 +2160,12 @@ int block_read_full_page(struct page *page, get_block_t *get_block) * truncates. Uses prepare/commit_write to allow the filesystem to * deal with the hole. */ -int generic_cont_expand(struct inode *inode, loff_t size) +static int __generic_cont_expand(struct inode *inode, loff_t size, + pgoff_t index, unsigned int offset) { struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long index, offset, limit; + unsigned long limit; int err; err = -EFBIG; @@ -2176,24 +2177,24 @@ int generic_cont_expand(struct inode *inode, loff_t size) if (size > inode->i_sb->s_maxbytes) goto out; - offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ - - /* ugh. in prepare/commit_write, if from==to==start of block, we - ** skip the prepare. make sure we never send an offset for the start - ** of a block - */ - if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { - offset++; - } - index = size >> PAGE_CACHE_SHIFT; err = -ENOMEM; page = grab_cache_page(mapping, index); if (!page) goto out; err = mapping->a_ops->prepare_write(NULL, page, offset, offset); - if (!err) { - err = mapping->a_ops->commit_write(NULL, page, offset, offset); + if (err) { + /* + * ->prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. + */ + unlock_page(page); + page_cache_release(page); + vmtruncate(inode, inode->i_size); + goto out; } + + err = mapping->a_ops->commit_write(NULL, page, offset, offset); + unlock_page(page); page_cache_release(page); if (err > 0) @@ -2202,6 +2203,36 @@ out: return err; } +int generic_cont_expand(struct inode *inode, loff_t size) +{ + pgoff_t index; + unsigned int offset; + + offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */ + + /* ugh. in prepare/commit_write, if from==to==start of block, we + ** skip the prepare. make sure we never send an offset for the start + ** of a block + */ + if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { + /* caller must handle this extra byte. */ + offset++; + } + index = size >> PAGE_CACHE_SHIFT; + + return __generic_cont_expand(inode, size, index, offset); +} + +int generic_cont_expand_simple(struct inode *inode, loff_t size) +{ + loff_t pos = size - 1; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1; + + /* prepare/commit_write can handle even if from==to==start of block. */ + return __generic_cont_expand(inode, size, index, offset); +} + /* * For moronic filesystems that do not allow holes in file. * We may have to extend the file. 
@@ -3145,6 +3176,7 @@ EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_commit_write); EXPORT_SYMBOL(generic_cont_expand); +EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(invalidate_bdev); EXPORT_SYMBOL(ll_rw_block); diff --git a/fs/fat/file.c b/fs/fat/file.c index 15229fe569..9b07c328a6 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -11,6 +11,7 @@ #include #include #include +#include int fat_generic_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) @@ -124,6 +125,24 @@ struct file_operations fat_file_operations = { .sendfile = generic_file_sendfile, }; +static int fat_cont_expand(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + loff_t start = inode->i_size, count = size - inode->i_size; + int err; + + err = generic_cont_expand_simple(inode, size); + if (err) + goto out; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + if (IS_SYNC(inode)) + err = sync_page_range_nolock(inode, mapping, start, count); +out: + return err; +} + int fat_notify_change(struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); @@ -132,11 +151,17 @@ int fat_notify_change(struct dentry *dentry, struct iattr *attr) lock_kernel(); - /* FAT cannot truncate to a longer file */ + /* + * Expand the file. Since inode_setattr() updates ->i_size + * before calling the ->truncate(), but FAT needs to fill the + * hole before it. + */ if (attr->ia_valid & ATTR_SIZE) { if (attr->ia_size > inode->i_size) { - error = -EPERM; - goto out; + error = fat_cont_expand(inode, attr->ia_size); + if (error || attr->ia_valid == ATTR_SIZE) + goto out; + attr->ia_valid &= ~ATTR_SIZE; } } diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 1db061bb6b..9f159baf15 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -197,7 +197,8 @@ int block_read_full_page(struct page*, get_block_t*); int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, loff_t *); -int generic_cont_expand(struct inode *inode, loff_t size) ; +int generic_cont_expand(struct inode *inode, loff_t size); +int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); int block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); -- cgit v1.2.2 From 28fd129827b00e12829d48a5290f46277600619b Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:14 -0800 Subject: [PATCH] Fix and add EXPORT_SYMBOL(filemap_write_and_wait) This patch add EXPORT_SYMBOL(filemap_write_and_wait) and use it. See mm/filemap.c: And changes the filemap_write_and_wait() and filemap_write_and_wait_range(). Current filemap_write_and_wait() doesn't wait if filemap_fdatawrite() returns error. However, even if filemap_fdatawrite() returned an error, it may have submitted the partially data pages to the device. (e.g. in the case of -ENOSPC) Andrew Morton writes, If filemap_fdatawrite() returns an error, this might be due to some I/O problem: dead disk, unplugged cable, etc. Given the generally crappy quality of the kernel's handling of such exceptions, there's a good chance that the filemap_fdatawait() will get stuck in D state forever. So, this patch doesn't wait if filemap_fdatawrite() returns the -EIO. 
Trond, could you please review the nfs part? Especially I'm not sure, nfs must use the "filemap_fdatawrite(inode->i_mapping) == 0", or not. Acked-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_dir.c | 3 +-- fs/9p/vfs_file.c | 3 +-- fs/buffer.c | 10 ++-------- fs/cifs/file.c | 6 ++---- fs/cifs/inode.c | 3 +-- fs/jfs/jfs_dmap.c | 3 +-- fs/jfs/jfs_imap.c | 6 ++---- fs/jfs/jfs_txnmgr.c | 6 ++---- fs/jfs/jfs_umount.c | 6 ++---- fs/jfs/resize.c | 3 +-- fs/jfs/super.c | 3 +-- fs/nfs/inode.c | 8 ++------ fs/smbfs/file.c | 3 +-- fs/smbfs/inode.c | 3 +-- fs/xfs/linux-2.6/xfs_fs_subr.c | 3 +-- mm/filemap.c | 40 +++++++++++++++++++++++++++------------- 16 files changed, 48 insertions(+), 61 deletions(-) diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 57a43b8fee..17089d1905 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -193,8 +193,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) fid->fid); fidnum = fid->fid; - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); if (fidnum >= 0) { dprintk(DEBUG_VFS, "fidopen: %d v9f->fid: %d\n", fid->fidopen, diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 89c849da85..e13577da91 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -165,8 +165,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) return -ENOLCK; if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); invalidate_inode_pages(&inode->i_data); } diff --git a/fs/buffer.c b/fs/buffer.c index 55023231e4..263df0f192 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -153,14 +153,8 @@ int sync_blockdev(struct block_device *bdev) { int ret = 0; - if (bdev) { - int err; - - ret = filemap_fdatawrite(bdev->bd_inode->i_mapping); - err = filemap_fdatawait(bdev->bd_inode->i_mapping); - if (!ret) - ret = err; - } + if (bdev) + ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); return ret; } EXPORT_SYMBOL(sync_blockdev); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 14a1c72ced..5ade53d7bc 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -127,8 +127,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, if (file->f_dentry->d_inode->i_mapping) { /* BB no need to lock inode until after invalidate since namei code should already have it locked? */ - filemap_fdatawrite(file->f_dentry->d_inode->i_mapping); - filemap_fdatawait(file->f_dentry->d_inode->i_mapping); + filemap_write_and_wait(file->f_dentry->d_inode->i_mapping); } cFYI(1, ("invalidating remote inode since open detected it " "changed")); @@ -419,8 +418,7 @@ static int cifs_reopen_file(struct inode *inode, struct file *file, pCifsInode = CIFS_I(inode); if (pCifsInode) { if (can_flush) { - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); /* temporarily disable caching while we go to server to get inode info */ pCifsInode->clientCanCacheAll = FALSE; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 411c1f7f84..9558f51bca 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1148,8 +1148,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) /* BB check if we need to refresh inode from server now ? 
BB */ /* need to flush data before changing file size on server */ - filemap_fdatawrite(direntry->d_inode->i_mapping); - filemap_fdatawait(direntry->d_inode->i_mapping); + filemap_write_and_wait(direntry->d_inode->i_mapping); if (attrs->ia_valid & ATTR_SIZE) { /* To avoid spurious oplock breaks from server, in the case of diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 68000a50ce..2967b73934 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -302,8 +302,7 @@ int dbSync(struct inode *ipbmap) /* * write out dirty pages of bmap */ - filemap_fdatawrite(ipbmap->i_mapping); - filemap_fdatawait(ipbmap->i_mapping); + filemap_write_and_wait(ipbmap->i_mapping); diWriteSpecial(ipbmap, 0); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 28201b194f..31b4aa13dd 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -265,8 +265,7 @@ int diSync(struct inode *ipimap) /* * write out dirty pages of imap */ - filemap_fdatawrite(ipimap->i_mapping); - filemap_fdatawait(ipimap->i_mapping); + filemap_write_and_wait(ipimap->i_mapping); diWriteSpecial(ipimap, 0); @@ -565,8 +564,7 @@ void diFreeSpecial(struct inode *ip) jfs_err("diFreeSpecial called with NULL ip!"); return; } - filemap_fdatawrite(ip->i_mapping); - filemap_fdatawait(ip->i_mapping); + filemap_write_and_wait(ip->i_mapping); truncate_inode_pages(ip->i_mapping, 0); iput(ip); } diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index b660c93c92..2ddb6b892b 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1231,10 +1231,8 @@ int txCommit(tid_t tid, /* transaction identifier */ * when we don't need to worry about it at all. * * if ((!S_ISDIR(ip->i_mode)) - * && (tblk->flag & COMMIT_DELETE) == 0) { - * filemap_fdatawrite(ip->i_mapping); - * filemap_fdatawait(ip->i_mapping); - * } + * && (tblk->flag & COMMIT_DELETE) == 0) + * filemap_write_and_wait(ip->i_mapping); */ /* diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index 5cf91785b5..21eaf7ac0f 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c @@ -108,8 +108,7 @@ int jfs_umount(struct super_block *sb) * Make sure all metadata makes it to disk before we mark * the superblock as clean */ - filemap_fdatawrite(sbi->direct_inode->i_mapping); - filemap_fdatawait(sbi->direct_inode->i_mapping); + filemap_write_and_wait(sbi->direct_inode->i_mapping); /* * ensure all file system file pages are propagated to their @@ -161,8 +160,7 @@ int jfs_umount_rw(struct super_block *sb) * mark the superblock clean before everything is flushed to * disk. 
*/ - filemap_fdatawrite(sbi->direct_inode->i_mapping); - filemap_fdatawait(sbi->direct_inode->i_mapping); + filemap_write_and_wait(sbi->direct_inode->i_mapping); updateSuper(sb, FM_CLEAN); diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index c6dc254d32..4518036187 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -376,8 +376,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) * by txCommit(); */ filemap_fdatawait(ipbmap->i_mapping); - filemap_fdatawrite(ipbmap->i_mapping); - filemap_fdatawait(ipbmap->i_mapping); + filemap_write_and_wait(ipbmap->i_mapping); diWriteSpecial(ipbmap, 0); newPage = nPages; /* first new page number */ diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 4226af3ea9..8d31f13364 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -502,8 +502,7 @@ out_no_rw: jfs_err("jfs_umount failed with return code %d", rc); } out_mount_failed: - filemap_fdatawrite(sbi->direct_inode->i_mapping); - filemap_fdatawait(sbi->direct_inode->i_mapping); + filemap_write_and_wait(sbi->direct_inode->i_mapping); truncate_inode_pages(sbi->direct_inode->i_mapping, 0); make_bad_inode(sbi->direct_inode); iput(sbi->direct_inode); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e7bd0d9260..3e4ba9cb7f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -644,10 +644,7 @@ int nfs_sync_mapping(struct address_space *mapping) if (mapping->nrpages == 0) return 0; unmap_mapping_range(mapping, 0, 0, 0); - ret = filemap_fdatawrite(mapping); - if (ret != 0) - goto out; - ret = filemap_fdatawait(mapping); + ret = filemap_write_and_wait(mapping); if (ret != 0) goto out; ret = nfs_wb_all(mapping->host); @@ -864,8 +861,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) nfs_begin_data_update(inode); /* Write all dirty data if we're changing file permissions or size */ if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE)) != 0) { - if (filemap_fdatawrite(inode->i_mapping) == 0) - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); nfs_wb_all(inode); } /* diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index b4fcfa8b55..3c6eb3ba71 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -374,8 +374,7 @@ smb_file_release(struct inode *inode, struct file * file) /* We must flush any dirty pages now as we won't be able to write anything after close. mmap can trigger this. "openers" should perhaps include mmap'ers ... 
*/ - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); smb_close(inode); } unlock_kernel(); diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 10b994428f..6ec88bf59b 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -697,8 +697,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr) DENTRY_PATH(dentry), (long) inode->i_size, (long) attr->ia_size); - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); error = smb_open(dentry, O_WRONLY); if (error) diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index f89340c61b..4fa4b1a518 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -79,8 +79,7 @@ fs_flushinval_pages( struct inode *ip = LINVFS_GET_IP(vp); if (VN_CACHED(vp)) { - filemap_fdatawrite(ip->i_mapping); - filemap_fdatawait(ip->i_mapping); + filemap_write_and_wait(ip->i_mapping); truncate_inode_pages(ip->i_mapping, first); } diff --git a/mm/filemap.c b/mm/filemap.c index 8fdf365080..478f4c74cc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -343,30 +343,44 @@ EXPORT_SYMBOL(filemap_fdatawait); int filemap_write_and_wait(struct address_space *mapping) { - int retval = 0; + int err = 0; if (mapping->nrpages) { - retval = filemap_fdatawrite(mapping); - if (retval == 0) - retval = filemap_fdatawait(mapping); + err = filemap_fdatawrite(mapping); + /* + * Even if the above returned error, the pages may be + * written partially (e.g. -ENOSPC), so we wait for it. + * But the -EIO is special case, it may indicate the worst + * thing (e.g. bug) happened, so we avoid waiting for it. + */ + if (err != -EIO) { + int err2 = filemap_fdatawait(mapping); + if (!err) + err = err2; + } } - return retval; + return err; } +EXPORT_SYMBOL(filemap_write_and_wait); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { - int retval = 0; + int err = 0; if (mapping->nrpages) { - retval = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); - if (retval == 0) - retval = wait_on_page_writeback_range(mapping, - lstart >> PAGE_CACHE_SHIFT, - lend >> PAGE_CACHE_SHIFT); + err = __filemap_fdatawrite_range(mapping, lstart, lend, + WB_SYNC_ALL); + /* See comment of filemap_write_and_wait() */ + if (err != -EIO) { + int err2 = wait_on_page_writeback_range(mapping, + lstart >> PAGE_CACHE_SHIFT, + lend >> PAGE_CACHE_SHIFT); + if (!err) + err = err2; + } } - return retval; + return err; } /* -- cgit v1.2.2 From 2a10e0b28b196051ae71829e5b989cba00513289 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 8 Jan 2006 01:02:15 -0800 Subject: [PATCH] move rtc_interrupt() prototype to rtc.h This patch moves the rtc_interrupt() prototype to rtc.h and removes the prototypes from C files. It also renames static rtc_interrupt() functions in arch/arm/mach-integrator/time.c and arch/sh64/kernel/time.c to avoid compile problems. 
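The renames are needed because a static, file-local definition may not follow a global declaration of the same name; once <linux/rtc.h> carries the prototype, the old code would stop compiling:

	/* include/linux/rtc.h now declares the shared handler */
	irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);

	/* a driver-private handler with the same name then fails to build,
	 * typically reported by gcc as:
	 *   "static declaration of 'rtc_interrupt' follows non-static declaration"
	 */
	static irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);

	/* hence arm_rtc_interrupt() and sh64_rtc_interrupt() in the hunks below */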
Signed-off-by: Adrian Bunk Signed-off-by: Paul Gortmaker Acked-by: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-integrator/time.c | 5 +++-- arch/i386/kernel/time_hpet.c | 2 -- arch/sh64/kernel/time.c | 7 ++++--- arch/x86_64/kernel/time.c | 2 -- include/linux/rtc.h | 3 +++ 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/arm/mach-integrator/time.c b/arch/arm/mach-integrator/time.c index 9f46aaef89..3c22c16b38 100644 --- a/arch/arm/mach-integrator/time.c +++ b/arch/arm/mach-integrator/time.c @@ -96,7 +96,8 @@ static struct rtc_ops rtc_ops = { .set_alarm = rtc_set_alarm, }; -static irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +static irqreturn_t arm_rtc_interrupt(int irq, void *dev_id, + struct pt_regs *regs) { writel(0, rtc_base + RTC_EOI); return IRQ_HANDLED; @@ -124,7 +125,7 @@ static int rtc_probe(struct amba_device *dev, void *id) xtime.tv_sec = __raw_readl(rtc_base + RTC_DR); - ret = request_irq(dev->irq[0], rtc_interrupt, SA_INTERRUPT, + ret = request_irq(dev->irq[0], arm_rtc_interrupt, SA_INTERRUPT, "rtc-pl030", dev); if (ret) goto map_out; diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c index 9caeaa315c..a529f0cdce 100644 --- a/arch/i386/kernel/time_hpet.c +++ b/arch/i386/kernel/time_hpet.c @@ -259,8 +259,6 @@ __setup("hpet=", hpet_setup); #include #include -extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs); - #define DEFAULT_RTC_INT_FREQ 64 #define RTC_NUM_INTS 1 diff --git a/arch/sh64/kernel/time.c b/arch/sh64/kernel/time.c index 870fe5327e..1195af37ee 100644 --- a/arch/sh64/kernel/time.c +++ b/arch/sh64/kernel/time.c @@ -417,7 +417,7 @@ static __init unsigned int get_cpu_hz(void) /* ** Regardless the toolchain, force the compiler to use the ** arbitrary register r3 as a clock tick counter. - ** NOTE: r3 must be in accordance with rtc_interrupt() + ** NOTE: r3 must be in accordance with sh64_rtc_interrupt() */ register unsigned long long __rtc_irq_flag __asm__ ("r3"); @@ -482,7 +482,8 @@ static __init unsigned int get_cpu_hz(void) #endif } -static irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +static irqreturn_t sh64_rtc_interrupt(int irq, void *dev_id, + struct pt_regs *regs) { ctrl_outb(0, RCR1); /* Disable Carry Interrupts */ regs->regs[3] = 1; /* Using r3 */ @@ -491,7 +492,7 @@ static irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) } static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL}; -static struct irqaction irq1 = { rtc_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "rtc", NULL, NULL}; +static struct irqaction irq1 = { sh64_rtc_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "rtc", NULL, NULL}; void __init time_init(void) { diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 74102796e5..43c9fa0f8d 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -1075,8 +1075,6 @@ device_initcall(time_init_device); */ #include -extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs); - #define DEFAULT_RTC_INT_FREQ 64 #define RTC_NUM_INTS 1 diff --git a/include/linux/rtc.h b/include/linux/rtc.h index e1aaf1fac8..0b2ba67ff1 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -11,6 +11,8 @@ #ifndef _LINUX_RTC_H_ #define _LINUX_RTC_H_ +#include + /* * The struct used to pass data via the following ioctl. 
Similar to the * struct tm in , but it needs to be here so that the kernel @@ -102,6 +104,7 @@ int rtc_register(rtc_task_t *task); int rtc_unregister(rtc_task_t *task); int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg); void rtc_get_rtc_time(struct rtc_time *rtc_tm); +irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs); #endif /* __KERNEL__ */ -- cgit v1.2.2 From b7b4d7a4666454b40b45a853bd1d296af37a85f0 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 8 Jan 2006 01:02:16 -0800 Subject: [PATCH] drivers/isdn/: "extern inline" -> "static inline" "extern inline" -> "static inline" Since there's no pullphone() function this patch removes the dead prototype. Signed-off-by: Adrian Bunk Acked-by: Karsten Keil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/isdn/act2000/act2000.h | 6 +++--- drivers/isdn/act2000/capi.h | 2 +- drivers/isdn/sc/command.c | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/isdn/act2000/act2000.h b/drivers/isdn/act2000/act2000.h index b091d1a541..d4c50512a1 100644 --- a/drivers/isdn/act2000/act2000.h +++ b/drivers/isdn/act2000/act2000.h @@ -181,17 +181,17 @@ typedef struct act2000_card { char regname[35]; /* Name used for request_region */ } act2000_card; -extern __inline__ void act2000_schedule_tx(act2000_card *card) +static inline void act2000_schedule_tx(act2000_card *card) { schedule_work(&card->snd_tq); } -extern __inline__ void act2000_schedule_rx(act2000_card *card) +static inline void act2000_schedule_rx(act2000_card *card) { schedule_work(&card->rcv_tq); } -extern __inline__ void act2000_schedule_poll(act2000_card *card) +static inline void act2000_schedule_poll(act2000_card *card) { schedule_work(&card->poll_tq); } diff --git a/drivers/isdn/act2000/capi.h b/drivers/isdn/act2000/capi.h index f6d5f530b8..e82a9289ad 100644 --- a/drivers/isdn/act2000/capi.h +++ b/drivers/isdn/act2000/capi.h @@ -330,7 +330,7 @@ typedef struct actcapi_msg { } msg; } actcapi_msg; -extern __inline__ unsigned short +static inline unsigned short actcapi_nextsmsg(act2000_card *card) { unsigned long flags; diff --git a/drivers/isdn/sc/command.c b/drivers/isdn/sc/command.c index 19f2fcf0ae..b4b24335f7 100644 --- a/drivers/isdn/sc/command.c +++ b/drivers/isdn/sc/command.c @@ -43,7 +43,6 @@ extern int send_and_receive(int, unsigned int, unsigned char, unsigned char, RspMessage *, int); extern int sendmessage(int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int *); -extern inline void pullphone(char *, char *); #ifdef DEBUG /* -- cgit v1.2.2 From 97a41e26124330e41aa10ef88cd1711bc3d17460 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 8 Jan 2006 01:02:17 -0800 Subject: [PATCH] kernel/: small cleanups This patch contains the following cleanups: - make needlessly global functions static - every file should include the headers containing the prototypes for it's global functions Signed-off-by: Adrian Bunk Acked-by: "Paul E. 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 2 +- kernel/irq/proc.c | 2 ++ kernel/rcutorture.c | 2 +- kernel/timer.c | 1 + 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/audit.c b/kernel/audit.c index 32fa03ad19..d13ab7d2d8 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -267,7 +267,7 @@ static int audit_set_failure(int state, uid_t loginuid) return old; } -int kauditd_thread(void *dummy) +static int kauditd_thread(void *dummy) { struct sk_buff *skb; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 8a64a4844c..d03b5eef8c 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -10,6 +10,8 @@ #include #include +#include "internals.h" + static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; #ifdef CONFIG_SMP diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 49fbbeff20..36efe088ad 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -103,7 +103,7 @@ atomic_t n_rcu_torture_error; /* * Allocate an element from the rcu_tortures pool. */ -struct rcu_torture * +static struct rcu_torture * rcu_torture_alloc(void) { struct list_head *p; diff --git a/kernel/timer.c b/kernel/timer.c index fd74268d86..074b4bd5cf 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include -- cgit v1.2.2 From 8382bf2e72d16d0532e351299121ccd3bca0fdd8 Mon Sep 17 00:00:00 2001 From: Peter Osterlund Date: Sun, 8 Jan 2006 01:02:17 -0800 Subject: [PATCH] pktcdvd: Use bd_claim to get exclusive access Use bd_claim() when opening the cdrom device to prevent user space programs such as cdrecord, hald and kded from interfering with the burning process. Signed-off-by: Peter Osterlund Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index c0233efabe..51b7a5c5b7 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1955,9 +1955,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, int write) if ((ret = blkdev_get(pd->bdev, FMODE_READ, O_RDONLY))) goto out; + if ((ret = bd_claim(pd->bdev, pd))) + goto out_putdev; + if ((ret = pkt_get_last_written(pd, &lba))) { printk("pktcdvd: pkt_get_last_written failed\n"); - goto out_putdev; + goto out_unclaim; } set_capacity(pd->disk, lba << 2); @@ -1967,7 +1970,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, int write) q = bdev_get_queue(pd->bdev); if (write) { if ((ret = pkt_open_write(pd))) - goto out_putdev; + goto out_unclaim; /* * Some CDRW drives can not handle writes larger than one packet, * even if the size is a multiple of the packet size. @@ -1982,13 +1985,15 @@ static int pkt_open_dev(struct pktcdvd_device *pd, int write) } if ((ret = pkt_set_segment_merging(pd, q))) - goto out_putdev; + goto out_unclaim; if (write) printk("pktcdvd: %lukB available on disc\n", lba << 1); return 0; +out_unclaim: + bd_release(pd->bdev); out_putdev: blkdev_put(pd->bdev); out: @@ -2007,6 +2012,7 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush) pkt_lock_door(pd, 0); pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); + bd_release(pd->bdev); blkdev_put(pd->bdev); } -- cgit v1.2.2 From a57004e1afb6ee03c509f1b1ec74a000682ab93b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 8 Jan 2006 01:02:19 -0800 Subject: [PATCH] atomic: dec_and_lock use atomic primitives Convert atomic_dec_and_lock to use new atomic primitives. 
Signed-off-by: Nick Piggin Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dec_and_lock.c | 49 ++++++------------------------------------------- 1 file changed, 6 insertions(+), 43 deletions(-) diff --git a/lib/dec_and_lock.c b/lib/dec_and_lock.c index 305a9663ae..a65c314555 100644 --- a/lib/dec_and_lock.c +++ b/lib/dec_and_lock.c @@ -1,47 +1,11 @@ #include #include #include -#include -#ifdef __HAVE_ARCH_CMPXCHG /* * This is an implementation of the notion of "decrement a * reference count, and return locked if it decremented to zero". * - * This implementation can be used on any architecture that - * has a cmpxchg, and where atomic->value is an int holding - * the value of the atomic (i.e. the high bits aren't used - * for a lock or anything like that). - */ -int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) -{ - int counter; - int newcount; - - for (;;) { - counter = atomic_read(atomic); - newcount = counter - 1; - if (!newcount) - break; /* do it the slow way */ - - newcount = cmpxchg(&atomic->counter, counter, newcount); - if (newcount == counter) - return 0; - } - - spin_lock(lock); - if (atomic_dec_and_test(atomic)) - return 1; - spin_unlock(lock); - return 0; -} -#else -/* - * This is an architecture-neutral, but slow, - * implementation of the notion of "decrement - * a reference count, and return locked if it - * decremented to zero". - * * NOTE NOTE NOTE! This is _not_ equivalent to * * if (atomic_dec_and_test(&atomic)) { @@ -52,21 +16,20 @@ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) * * because the spin-lock and the decrement must be * "atomic". - * - * This slow version gets the spinlock unconditionally, - * and releases it if it isn't needed. Architectures - * are encouraged to come up with better approaches, - * this is trivially done efficiently using a load-locked - * store-conditional approach, for example. */ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { +#ifdef CONFIG_SMP + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; +#endif + /* Otherwise do it the slow way */ spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; spin_unlock(lock); return 0; } -#endif EXPORT_SYMBOL(_atomic_dec_and_lock); -- cgit v1.2.2 From 095975da26dba21698582e91e96be10f7417333f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 8 Jan 2006 01:02:19 -0800 Subject: [PATCH] rcu file: use atomic primitives Use atomic_inc_not_zero for rcu files instead of special case rcuref. Signed-off-by: Nick Piggin Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/RCU/rcuref.txt | 87 ++++++++--------- fs/aio.c | 3 +- fs/file_table.c | 8 +- include/linux/fs.h | 3 +- include/linux/radix-tree.h | 1 + include/linux/rcuref.h | 220 ------------------------------------------- kernel/rcupdate.c | 14 --- kernel/rcutorture.c | 1 - security/selinux/hooks.c | 2 +- 9 files changed, 48 insertions(+), 291 deletions(-) delete mode 100644 include/linux/rcuref.h diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt index a23fee6606..3f60db41b2 100644 --- a/Documentation/RCU/rcuref.txt +++ b/Documentation/RCU/rcuref.txt @@ -1,74 +1,67 @@ -Refcounter framework for elements of lists/arrays protected by -RCU. +Refcounter design for elements of lists/arrays protected by RCU. 
Refcounting on elements of lists which are protected by traditional reader/writer spinlocks or semaphores are straight forward as in: -1. 2. -add() search_and_reference() -{ { - alloc_object read_lock(&list_lock); - ... search_for_element - atomic_set(&el->rc, 1); atomic_inc(&el->rc); - write_lock(&list_lock); ... - add_element read_unlock(&list_lock); - ... ... - write_unlock(&list_lock); } +1. 2. +add() search_and_reference() +{ { + alloc_object read_lock(&list_lock); + ... search_for_element + atomic_set(&el->rc, 1); atomic_inc(&el->rc); + write_lock(&list_lock); ... + add_element read_unlock(&list_lock); + ... ... + write_unlock(&list_lock); } } 3. 4. release_referenced() delete() { { - ... write_lock(&list_lock); - atomic_dec(&el->rc, relfunc) ... - ... delete_element -} write_unlock(&list_lock); - ... - if (atomic_dec_and_test(&el->rc)) - kfree(el); - ... + ... write_lock(&list_lock); + atomic_dec(&el->rc, relfunc) ... + ... delete_element +} write_unlock(&list_lock); + ... + if (atomic_dec_and_test(&el->rc)) + kfree(el); + ... } If this list/array is made lock free using rcu as in changing the write_lock in add() and delete() to spin_lock and changing read_lock -in search_and_reference to rcu_read_lock(), the rcuref_get in +in search_and_reference to rcu_read_lock(), the atomic_get in search_and_reference could potentially hold reference to an element which -has already been deleted from the list/array. rcuref_lf_get_rcu takes +has already been deleted from the list/array. atomic_inc_not_zero takes care of this scenario. search_and_reference should look as; 1. 2. add() search_and_reference() { { - alloc_object rcu_read_lock(); - ... search_for_element - atomic_set(&el->rc, 1); if (rcuref_inc_lf(&el->rc)) { - write_lock(&list_lock); rcu_read_unlock(); - return FAIL; - add_element } - ... ... - write_unlock(&list_lock); rcu_read_unlock(); + alloc_object rcu_read_lock(); + ... search_for_element + atomic_set(&el->rc, 1); if (atomic_inc_not_zero(&el->rc)) { + write_lock(&list_lock); rcu_read_unlock(); + return FAIL; + add_element } + ... ... + write_unlock(&list_lock); rcu_read_unlock(); } } 3. 4. release_referenced() delete() { { - ... write_lock(&list_lock); - rcuref_dec(&el->rc, relfunc) ... - ... delete_element -} write_unlock(&list_lock); - ... - if (rcuref_dec_and_test(&el->rc)) - call_rcu(&el->head, el_free); - ... + ... write_lock(&list_lock); + atomic_dec(&el->rc, relfunc) ... + ... delete_element +} write_unlock(&list_lock); + ... + if (atomic_dec_and_test(&el->rc)) + call_rcu(&el->head, el_free); + ... } Sometimes, reference to the element need to be obtained in the -update (write) stream. In such cases, rcuref_inc_lf might be an overkill -since the spinlock serialising list updates are held. rcuref_inc +update (write) stream. In such cases, atomic_inc_not_zero might be an +overkill since the spinlock serialising list updates are held. atomic_inc is to be used in such cases. -For arches which do not have cmpxchg rcuref_inc_lf -api uses a hashed spinlock implementation and the same hashed spinlock -is acquired in all rcuref_xxx primitives to preserve atomicity. -Note: Use rcuref_inc api only if you need to use rcuref_inc_lf on the -refcounter atleast at one place. Mixing rcuref_inc and atomic_xxx api -might lead to races. rcuref_inc_lf() must be used in lockfree -RCU critical sections only. 
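Condensed to code, the lookup side described in the rewritten text above is just a conditional reference grab inside an RCU read-side critical section. The sketch below is illustrative: my_elem, rc and my_search_for_element() are invented names, and the only primitive it relies on is the atomic_inc_not_zero() the document now recommends.

    #include <linux/rcupdate.h>
    #include <asm/atomic.h>

    struct my_elem {
            atomic_t rc;            /* reference count, set to 1 at insert */
            /* ... key, list linkage, RCU head ... */
    };

    static struct my_elem *my_search_for_element(int key);  /* lock-free walk, illustrative */

    static struct my_elem *my_search_and_reference(int key)
    {
            struct my_elem *el;

            rcu_read_lock();
            el = my_search_for_element(key);
            if (el && !atomic_inc_not_zero(&el->rc))
                    el = NULL;      /* raced with the final put; element is dying */
            rcu_read_unlock();

            return el;              /* if non-NULL, the reference outlives the RCU section */
    }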
+ diff --git a/fs/aio.c b/fs/aio.c index 5a28b69ad2..aec2b1916d 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -514,7 +513,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) /* Must be done under the lock to serialise against cancellation. * Call this aio_fput as it duplicates fput via the fput_work. */ - if (unlikely(rcuref_dec_and_test(&req->ki_filp->f_count))) { + if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) { get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); diff --git a/fs/file_table.c b/fs/file_table.c index c3a5e2fd66..6142250104 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -117,7 +117,7 @@ EXPORT_SYMBOL(get_empty_filp); void fastcall fput(struct file *file) { - if (rcuref_dec_and_test(&file->f_count)) + if (atomic_dec_and_test(&file->f_count)) __fput(file); } @@ -166,7 +166,7 @@ struct file fastcall *fget(unsigned int fd) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (!rcuref_inc_lf(&file->f_count)) { + if (!atomic_inc_not_zero(&file->f_count)) { /* File object ref couldn't be taken */ rcu_read_unlock(); return NULL; @@ -198,7 +198,7 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (rcuref_inc_lf(&file->f_count)) + if (atomic_inc_not_zero(&file->f_count)) *fput_needed = 1; else /* Didn't get the reference, someone's freed */ @@ -213,7 +213,7 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed) void put_filp(struct file *file) { - if (rcuref_dec_and_test(&file->f_count)) { + if (atomic_dec_and_test(&file->f_count)) { security_file_free(file); file_kill(file); file_free(file); diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c9c48d656..ef29500b5d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -9,7 +9,6 @@ #include #include #include -#include /* * It's silly to have NR_OPEN bigger than NR_FILE, but you can change @@ -653,7 +652,7 @@ extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); #define file_list_unlock() spin_unlock(&files_lock); -#define get_file(x) rcuref_inc(&(x)->f_count) +#define get_file(x) atomic_inc(&(x)->f_count) #define file_count(x) atomic_read(&(x)->f_count) #define MAX_NON_LFS ((1UL<<31) - 1) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 36e5d26961..c57ff2fcb3 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -19,6 +19,7 @@ #ifndef _LINUX_RADIX_TREE_H #define _LINUX_RADIX_TREE_H +#include #include #include diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h deleted file mode 100644 index e1adbba14b..0000000000 --- a/include/linux/rcuref.h +++ /dev/null @@ -1,220 +0,0 @@ -/* - * rcuref.h - * - * Reference counting for elements of lists/arrays protected by - * RCU. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2005 - * - * Author: Dipankar Sarma - * Ravikiran Thirumalai - * - * See Documentation/RCU/rcuref.txt for detailed user guide. - * - */ - -#ifndef _RCUREF_H_ -#define _RCUREF_H_ - -#ifdef __KERNEL__ - -#include -#include -#include -#include - -/* - * These APIs work on traditional atomic_t counters used in the - * kernel for reference counting. Under special circumstances - * where a lock-free get() operation races with a put() operation - * these APIs can be used. See Documentation/RCU/rcuref.txt. - */ - -#ifdef __HAVE_ARCH_CMPXCHG - -/** - * rcuref_inc - increment refcount for object. - * @rcuref: reference counter in the object in question. - * - * This should be used only for objects where we use RCU and - * use the rcuref_inc_lf() api to acquire a reference - * in a lock-free reader-side critical section. - */ -static inline void rcuref_inc(atomic_t *rcuref) -{ - atomic_inc(rcuref); -} - -/** - * rcuref_dec - decrement refcount for object. - * @rcuref: reference counter in the object in question. - * - * This should be used only for objects where we use RCU and - * use the rcuref_inc_lf() api to acquire a reference - * in a lock-free reader-side critical section. - */ -static inline void rcuref_dec(atomic_t *rcuref) -{ - atomic_dec(rcuref); -} - -/** - * rcuref_dec_and_test - decrement refcount for object and test - * @rcuref: reference counter in the object. - * @release: pointer to the function that will clean up the object - * when the last reference to the object is released. - * This pointer is required. - * - * Decrement the refcount, and if 0, return 1. Else return 0. - * - * This should be used only for objects where we use RCU and - * use the rcuref_inc_lf() api to acquire a reference - * in a lock-free reader-side critical section. - */ -static inline int rcuref_dec_and_test(atomic_t *rcuref) -{ - return atomic_dec_and_test(rcuref); -} - -/* - * cmpxchg is needed on UP too, if deletions to the list/array can happen - * in interrupt context. - */ - -/** - * rcuref_inc_lf - Take reference to an object in a read-side - * critical section protected by RCU. - * @rcuref: reference counter in the object in question. - * - * Try and increment the refcount by 1. The increment might fail if - * the reference counter has been through a 1 to 0 transition and - * is no longer part of the lock-free list. - * Returns non-zero on successful increment and zero otherwise. - */ -static inline int rcuref_inc_lf(atomic_t *rcuref) -{ - int c, old; - c = atomic_read(rcuref); - while (c && (old = cmpxchg(&rcuref->counter, c, c + 1)) != c) - c = old; - return c; -} - -#else /* !__HAVE_ARCH_CMPXCHG */ - -extern spinlock_t __rcuref_hash[]; - -/* - * Use a hash table of locks to protect the reference count - * since cmpxchg is not available in this arch. - */ -#ifdef CONFIG_SMP -#define RCUREF_HASH_SIZE 4 -#define RCUREF_HASH(k) \ - (&__rcuref_hash[(((unsigned long)k)>>8) & (RCUREF_HASH_SIZE-1)]) -#else -#define RCUREF_HASH_SIZE 1 -#define RCUREF_HASH(k) &__rcuref_hash[0] -#endif /* CONFIG_SMP */ - -/** - * rcuref_inc - increment refcount for object. - * @rcuref: reference counter in the object in question. 
- * - * This should be used only for objects where we use RCU and - * use the rcuref_inc_lf() api to acquire a reference in a lock-free - * reader-side critical section. - */ -static inline void rcuref_inc(atomic_t *rcuref) -{ - unsigned long flags; - spin_lock_irqsave(RCUREF_HASH(rcuref), flags); - rcuref->counter += 1; - spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags); -} - -/** - * rcuref_dec - decrement refcount for object. - * @rcuref: reference counter in the object in question. - * - * This should be used only for objects where we use RCU and - * use the rcuref_inc_lf() api to acquire a reference in a lock-free - * reader-side critical section. - */ -static inline void rcuref_dec(atomic_t *rcuref) -{ - unsigned long flags; - spin_lock_irqsave(RCUREF_HASH(rcuref), flags); - rcuref->counter -= 1; - spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags); -} - -/** - * rcuref_dec_and_test - decrement refcount for object and test - * @rcuref: reference counter in the object. - * @release: pointer to the function that will clean up the object - * when the last reference to the object is released. - * This pointer is required. - * - * Decrement the refcount, and if 0, return 1. Else return 0. - * - * This should be used only for objects where we use RCU and - * use the rcuref_inc_lf() api to acquire a reference in a lock-free - * reader-side critical section. - */ -static inline int rcuref_dec_and_test(atomic_t *rcuref) -{ - unsigned long flags; - spin_lock_irqsave(RCUREF_HASH(rcuref), flags); - rcuref->counter--; - if (!rcuref->counter) { - spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags); - return 1; - } else { - spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags); - return 0; - } -} - -/** - * rcuref_inc_lf - Take reference to an object of a lock-free collection - * by traversing a lock-free list/array. - * @rcuref: reference counter in the object in question. - * - * Try and increment the refcount by 1. The increment might fail if - * the reference counter has been through a 1 to 0 transition and - * object is no longer part of the lock-free list. - * Returns non-zero on successful increment and zero otherwise. - */ -static inline int rcuref_inc_lf(atomic_t *rcuref) -{ - int ret; - unsigned long flags; - spin_lock_irqsave(RCUREF_HASH(rcuref), flags); - if (rcuref->counter) - ret = rcuref->counter++; - else - ret = 0; - spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags); - return ret; -} - - -#endif /* !__HAVE_ARCH_CMPXCHG */ - -#endif /* __KERNEL__ */ -#endif /* _RCUREF_H_ */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 0a669bd2f6..30b0bba038 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -46,7 +46,6 @@ #include #include #include -#include #include /* Definition for rcupdate control block. */ @@ -74,19 +73,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; static int maxbatch = 10000; -#ifndef __HAVE_ARCH_CMPXCHG -/* - * We use an array of spinlocks for the rcurefs -- similar to ones in sparc - * 32 bit atomic_t implementations, and a hash function similar to that - * for our refcounting needs. - * Can't help multiprocessors which donot have cmpxchg :( - */ - -spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = { - [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED -}; -#endif - /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. 
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 36efe088ad..75174c8152 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 3d496eae1b..6647204e46 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1663,7 +1663,7 @@ static inline void flush_unauthorized_files(struct files_struct * files) continue; } if (devnull) { - rcuref_inc(&devnull->f_count); + get_file(devnull); } else { devnull = dentry_open(dget(selinux_null), mntget(selinuxfs_mount), O_RDWR); if (!devnull) { -- cgit v1.2.2 From b3f3d6141f8636f627bf19fd44eaf59a52637ac8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Sun, 8 Jan 2006 01:02:20 -0800 Subject: [PATCH] ELF: symbol table type additions Needed for the Novell kernel debugger and perhaps some per-cpu data on x86_64 in the future. Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/elf.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/elf.h b/include/linux/elf.h index ff955dbf51..d3bfacb244 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -151,6 +151,8 @@ typedef __s64 Elf64_Sxword; #define STT_FUNC 2 #define STT_SECTION 3 #define STT_FILE 4 +#define STT_COMMON 5 +#define STT_TLS 6 #define ELF_ST_BIND(x) ((x) >> 4) #define ELF_ST_TYPE(x) (((unsigned int) x) & 0xf) -- cgit v1.2.2 From b33291c0bcecfa44baa905964eec4b8815dcbcdf Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:02:21 -0800 Subject: [PATCH] ipc: expand shm_flags Unobfsucate this struct member Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 0ef4a1cf3e..0b92e874fc 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -34,8 +34,6 @@ #include "util.h" -#define shm_flags shm_perm.mode - static struct file_operations shm_file_operations; static struct vm_operations_struct shm_vm_ops; @@ -148,7 +146,7 @@ static void shm_close (struct vm_area_struct *shmd) shp->shm_dtim = get_seconds(); shp->shm_nattch--; if(shp->shm_nattch == 0 && - shp->shm_flags & SHM_DEST) + shp->shm_perm.mode & SHM_DEST) shm_destroy (shp); else shm_unlock(shp); @@ -205,7 +203,7 @@ static int newseg (key_t key, int shmflg, size_t size) return -ENOMEM; shp->shm_perm.key = key; - shp->shm_flags = (shmflg & S_IRWXUGO); + shp->shm_perm.mode = (shmflg & S_IRWXUGO); shp->mlock_user = NULL; shp->shm_perm.security = NULL; @@ -345,7 +343,7 @@ static inline unsigned long copy_shmid_from_user(struct shm_setbuf *out, void __ out->uid = tbuf.shm_perm.uid; out->gid = tbuf.shm_perm.gid; - out->mode = tbuf.shm_flags; + out->mode = tbuf.shm_perm.mode; return 0; } @@ -358,7 +356,7 @@ static inline unsigned long copy_shmid_from_user(struct shm_setbuf *out, void __ out->uid = tbuf_old.shm_perm.uid; out->gid = tbuf_old.shm_perm.gid; - out->mode = tbuf_old.shm_flags; + out->mode = tbuf_old.shm_perm.mode; return 0; } @@ -560,13 +558,13 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) if (!is_file_hugepages(shp->shm_file)) { err = shmem_lock(shp->shm_file, 1, user); if (!err) { - shp->shm_flags |= SHM_LOCKED; + shp->shm_perm.mode |= SHM_LOCKED; shp->mlock_user = user; } } } else if (!is_file_hugepages(shp->shm_file)) { shmem_lock(shp->shm_file, 0, shp->mlock_user); - shp->shm_flags &= ~SHM_LOCKED; + shp->shm_perm.mode 
&= ~SHM_LOCKED; shp->mlock_user = NULL; } shm_unlock(shp); @@ -605,7 +603,7 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) goto out_unlock_up; if (shp->shm_nattch){ - shp->shm_flags |= SHM_DEST; + shp->shm_perm.mode |= SHM_DEST; /* Do not find it any more */ shp->shm_perm.key = IPC_PRIVATE; shm_unlock(shp); @@ -644,7 +642,7 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) shp->shm_perm.uid = setbuf.uid; shp->shm_perm.gid = setbuf.gid; - shp->shm_flags = (shp->shm_flags & ~S_IRWXUGO) + shp->shm_perm.mode = (shp->shm_perm.mode & ~S_IRWXUGO) | (setbuf.mode & S_IRWXUGO); shp->shm_ctim = get_seconds(); break; @@ -777,7 +775,7 @@ invalid: BUG(); shp->shm_nattch--; if(shp->shm_nattch == 0 && - shp->shm_flags & SHM_DEST) + shp->shm_perm.mode & SHM_DEST) shm_destroy (shp); else shm_unlock(shp); @@ -902,7 +900,7 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it) return seq_printf(s, format, shp->shm_perm.key, shp->id, - shp->shm_flags, + shp->shm_perm.mode, shp->shm_segsz, shp->shm_cprid, shp->shm_lprid, -- cgit v1.2.2 From 6625b861f8f0e429902b8671b3e70792cd99074e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:23 -0800 Subject: [PATCH] relayfs: decouple buffer creation from inode creation The patch series implementa or fixes 3 things that were specifically requested or suggested by relayfs users: - support for non-relay files (patches 1-6) Currently, the relayfs API only supports the creation of directories (relayfs_create_dir()) and relay files (relay_open()). These patches adds support for non-relay files (relayfs_create_file()). This is so relayfs applications can create 'control files' in relayfs itself rather than in /proc or via a netlink channel, as is currently done in the relay-app examples. Basically what this amounts to is exporting relayfs_create_file() with an additional file_ops param that clients can use to supply file operations for their own special-purpose files in relayfs. - make exported relay file ops useful (patches 7-8) The relayfs relay_file_operations have always been exported, the intent being to make it possible to create relay files in other filesystems such as debugfs. The problem, though, is that currently the file operations are too tightly coupled to relayfs to actually be used for this purpose. This patch fixes that by adding a couple of callback functions that allow a client to hook into relay_open()/close() and supply the files that will be used to represent the channel buffers; the default implementation if no callbacks are defined is to create the files in relayfs. - add an option to create global relay buffer (patches 9-10) The file creation callback also supplies an optional param, is_global, that can be used by clients to create a single global relayfs buffer instead of the default per-cpu buffers. This was suggested as being useful for certain debugging applications where it's more convenient to be able to get all the data from a single channel without having to go to the bother of dealing with per-cpu files. - cleanup, some renaming and Documentation updates (patches 11-12) There were several comments that the use of netlink in the example code was non-intuitive and in fact the whole relay-app business was needlessly confusing. Based on that feedback, the example code has been completely converted over to relayfs control files as supported by this patch, and have also been made completely self-contained. 
The converted examples along with a couple of new examples that demonstrate using exported relay files can be found in relay-apps tarball: http://prdownloads.sourceforge.net/relayfs/relay-apps-0.9.tar.gz?download This patch: Separate buffer create/destroy from inode create/destroy. We want to be able to associate other data and not just relay buffers with inodes. Buffer create/destroy is moved out of inode.c and into relayfs core code. Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/buffers.c | 1 + fs/relayfs/inode.c | 31 +++++++++---------------------- fs/relayfs/relay.c | 11 ++++++++--- fs/relayfs/relay.h | 2 +- 4 files changed, 19 insertions(+), 26 deletions(-) diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c index 84e21ffa5c..667b529944 100644 --- a/fs/relayfs/buffers.c +++ b/fs/relayfs/buffers.c @@ -186,4 +186,5 @@ void relay_remove_buf(struct kref *kref) { struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); relayfs_remove(buf->dentry); + relay_destroy_buf(buf); } diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c index 0f7f88d067..379e07cd2b 100644 --- a/fs/relayfs/inode.c +++ b/fs/relayfs/inode.c @@ -34,23 +34,13 @@ static struct backing_dev_info relayfs_backing_dev_info = { }; static struct inode *relayfs_get_inode(struct super_block *sb, int mode, - struct rchan *chan) + void *data) { - struct rchan_buf *buf = NULL; struct inode *inode; - if (S_ISREG(mode)) { - BUG_ON(!chan); - buf = relay_create_buf(chan); - if (!buf) - return NULL; - } - inode = new_inode(sb); - if (!inode) { - relay_destroy_buf(buf); + if (!inode) return NULL; - } inode->i_mode = mode; inode->i_uid = 0; @@ -62,7 +52,7 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, switch (mode & S_IFMT) { case S_IFREG: inode->i_fop = &relayfs_file_operations; - RELAYFS_I(inode)->buf = buf; + RELAYFS_I(inode)->buf = data; break; case S_IFDIR: inode->i_op = &simple_dir_inode_operations; @@ -83,7 +73,7 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, * @name: the name of the file to create * @parent: parent directory * @mode: mode - * @chan: relay channel associated with the file + * @data: user-associated data for this file * * Returns the new dentry, NULL on failure * @@ -92,7 +82,7 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, static struct dentry *relayfs_create_entry(const char *name, struct dentry *parent, int mode, - struct rchan *chan) + void *data) { struct dentry *d; struct inode *inode; @@ -127,7 +117,7 @@ static struct dentry *relayfs_create_entry(const char *name, goto release_mount; } - inode = relayfs_get_inode(parent->d_inode->i_sb, mode, chan); + inode = relayfs_get_inode(parent->d_inode->i_sb, mode, data); if (!inode) { d = NULL; goto release_mount; @@ -155,20 +145,20 @@ exit: * @name: the name of the file to create * @parent: parent directory * @mode: mode, if not specied the default perms are used - * @chan: channel associated with the file + * @data: user-associated data for this file * * Returns file dentry if successful, NULL otherwise. * * The file will be created user r on behalf of current user. 
*/ struct dentry *relayfs_create_file(const char *name, struct dentry *parent, - int mode, struct rchan *chan) + int mode, void *data) { if (!mode) mode = S_IRUSR; mode = (mode & S_IALLUGO) | S_IFREG; - return relayfs_create_entry(name, parent, mode, chan); + return relayfs_create_entry(name, parent, mode, data); } /** @@ -505,9 +495,6 @@ static struct inode *relayfs_alloc_inode(struct super_block *sb) */ static void relayfs_destroy_inode(struct inode *inode) { - if (RELAYFS_I(inode)->buf) - relay_destroy_buf(RELAYFS_I(inode)->buf); - kmem_cache_free(relayfs_inode_cachep, RELAYFS_I(inode)); } diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c index 2a6f7f12b7..7fbda177ad 100644 --- a/fs/relayfs/relay.c +++ b/fs/relayfs/relay.c @@ -171,12 +171,17 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, struct rchan_buf *buf; struct dentry *dentry; + buf = relay_create_buf(chan); + if (!buf) + return NULL; + /* Create file in fs */ - dentry = relayfs_create_file(filename, parent, S_IRUSR, chan); - if (!dentry) + dentry = relayfs_create_file(filename, parent, S_IRUSR, buf); + if (!dentry) { + relay_destroy_buf(buf); return NULL; + } - buf = RELAYFS_I(dentry->d_inode)->buf; buf->dentry = dentry; __relay_reset(buf, 1); diff --git a/fs/relayfs/relay.h b/fs/relayfs/relay.h index 703503fa22..c325bb2435 100644 --- a/fs/relayfs/relay.h +++ b/fs/relayfs/relay.h @@ -4,7 +4,7 @@ struct dentry *relayfs_create_file(const char *name, struct dentry *parent, int mode, - struct rchan *chan); + void *data); extern int relayfs_remove(struct dentry *dentry); extern int relay_buf_empty(struct rchan_buf *buf); extern void relay_destroy_channel(struct kref *kref); -- cgit v1.2.2 From 907f2c77d1653ce235e8e1fd6ce5c46005814e78 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:24 -0800 Subject: [PATCH] relayfs: export relayfs_create_file() with fileops param This patch adds a mandatory fileops param to relayfs_create_file() and exports that function so that clients can use it to create files defined by their own set of file operations, in relayfs. The purpose is to allow relayfs applications to create their own set of 'control' files alongside their relay files in relayfs rather than having to create them in /proc or debugfs for instance. relayfs_create_file() is also used by relay_open_buf() to create the relay files for a channel. In this case, a pointer to relayfs_file_operations is passed in, along with a pointer to the buffer associated with the file. 
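With the file operations made a parameter, a client can now put its own 'control' file next to its relay files instead of using /proc or netlink. The sketch below is only an illustration of the call: my_ctl_open/read/write and my_state stand in for client-defined operations and state, while relayfs_create_file() is used with exactly the signature this patch exports.

    #include <linux/module.h>
    #include <linux/fs.h>
    #include <linux/relayfs_fs.h>

    static int my_state;                    /* placeholder client state */

    static int my_ctl_open(struct inode *inode, struct file *filp)
    {
            return 0;                       /* real code would locate its state here */
    }

    static ssize_t my_ctl_read(struct file *filp, char __user *buf,
                               size_t count, loff_t *ppos)
    {
            return 0;                       /* placeholder */
    }

    static ssize_t my_ctl_write(struct file *filp, const char __user *buf,
                                size_t count, loff_t *ppos)
    {
            return count;                   /* placeholder: pretend all input was consumed */
    }

    static struct file_operations my_ctl_fops = {
            .owner = THIS_MODULE,
            .open  = my_ctl_open,
            .read  = my_ctl_read,
            .write = my_ctl_write,
    };

    static struct dentry *my_create_control(struct dentry *parent)
    {
            /* NULL return means the control file could not be created. */
            return relayfs_create_file("control", parent, S_IRUSR | S_IWUSR,
                                       &my_ctl_fops, &my_state);
    }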
Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/inode.c | 41 ++++++++++++++++++++++++++--------------- fs/relayfs/relay.c | 3 ++- fs/relayfs/relay.h | 4 ---- include/linux/relayfs_fs.h | 7 ++++++- 4 files changed, 34 insertions(+), 21 deletions(-) diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c index 379e07cd2b..a5e6d4f2ef 100644 --- a/fs/relayfs/inode.c +++ b/fs/relayfs/inode.c @@ -33,7 +33,9 @@ static struct backing_dev_info relayfs_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, }; -static struct inode *relayfs_get_inode(struct super_block *sb, int mode, +static struct inode *relayfs_get_inode(struct super_block *sb, + int mode, + struct file_operations *fops, void *data) { struct inode *inode; @@ -51,8 +53,8 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { case S_IFREG: - inode->i_fop = &relayfs_file_operations; - RELAYFS_I(inode)->buf = data; + inode->i_fop = fops; + RELAYFS_I(inode)->data = data; break; case S_IFDIR: inode->i_op = &simple_dir_inode_operations; @@ -73,6 +75,7 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, * @name: the name of the file to create * @parent: parent directory * @mode: mode + * @fops: file operations to use for the file * @data: user-associated data for this file * * Returns the new dentry, NULL on failure @@ -82,6 +85,7 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, static struct dentry *relayfs_create_entry(const char *name, struct dentry *parent, int mode, + struct file_operations *fops, void *data) { struct dentry *d; @@ -117,7 +121,7 @@ static struct dentry *relayfs_create_entry(const char *name, goto release_mount; } - inode = relayfs_get_inode(parent->d_inode->i_sb, mode, data); + inode = relayfs_get_inode(parent->d_inode->i_sb, mode, fops, data); if (!inode) { d = NULL; goto release_mount; @@ -145,20 +149,26 @@ exit: * @name: the name of the file to create * @parent: parent directory * @mode: mode, if not specied the default perms are used + * @fops: file operations to use for the file * @data: user-associated data for this file * * Returns file dentry if successful, NULL otherwise. * * The file will be created user r on behalf of current user. 
*/ -struct dentry *relayfs_create_file(const char *name, struct dentry *parent, - int mode, void *data) +struct dentry *relayfs_create_file(const char *name, + struct dentry *parent, + int mode, + struct file_operations *fops, + void *data) { + BUG_ON(!fops); + if (!mode) mode = S_IRUSR; mode = (mode & S_IALLUGO) | S_IFREG; - return relayfs_create_entry(name, parent, mode, data); + return relayfs_create_entry(name, parent, mode, fops, data); } /** @@ -173,7 +183,7 @@ struct dentry *relayfs_create_file(const char *name, struct dentry *parent, struct dentry *relayfs_create_dir(const char *name, struct dentry *parent) { int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - return relayfs_create_entry(name, parent, mode, NULL); + return relayfs_create_entry(name, parent, mode, NULL, NULL); } /** @@ -234,7 +244,7 @@ int relayfs_remove_dir(struct dentry *dentry) */ static int relayfs_open(struct inode *inode, struct file *filp) { - struct rchan_buf *buf = RELAYFS_I(inode)->buf; + struct rchan_buf *buf = RELAYFS_I(inode)->data; kref_get(&buf->kref); return 0; @@ -250,7 +260,7 @@ static int relayfs_open(struct inode *inode, struct file *filp) static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma) { struct inode *inode = filp->f_dentry->d_inode; - return relay_mmap_buf(RELAYFS_I(inode)->buf, vma); + return relay_mmap_buf(RELAYFS_I(inode)->data, vma); } /** @@ -264,7 +274,7 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait) { unsigned int mask = 0; struct inode *inode = filp->f_dentry->d_inode; - struct rchan_buf *buf = RELAYFS_I(inode)->buf; + struct rchan_buf *buf = RELAYFS_I(inode)->data; if (buf->finalized) return POLLERR; @@ -288,7 +298,7 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait) */ static int relayfs_release(struct inode *inode, struct file *filp) { - struct rchan_buf *buf = RELAYFS_I(inode)->buf; + struct rchan_buf *buf = RELAYFS_I(inode)->data; kref_put(&buf->kref, relay_remove_buf); return 0; @@ -450,7 +460,7 @@ static ssize_t relayfs_read(struct file *filp, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct rchan_buf *buf = RELAYFS_I(inode)->buf; + struct rchan_buf *buf = RELAYFS_I(inode)->data; size_t read_start, avail; ssize_t ret = 0; void *from; @@ -485,7 +495,7 @@ static struct inode *relayfs_alloc_inode(struct super_block *sb) struct relayfs_inode_info *p = kmem_cache_alloc(relayfs_inode_cachep, SLAB_KERNEL); if (!p) return NULL; - p->buf = NULL; + p->data = NULL; return &p->vfs_inode; } @@ -531,7 +541,7 @@ static int relayfs_fill_super(struct super_block * sb, void * data, int silent) sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = RELAYFS_MAGIC; sb->s_op = &relayfs_ops; - inode = relayfs_get_inode(sb, mode, NULL); + inode = relayfs_get_inode(sb, mode, NULL, NULL); if (!inode) return -ENOMEM; @@ -589,6 +599,7 @@ module_exit(exit_relayfs_fs) EXPORT_SYMBOL_GPL(relayfs_file_operations); EXPORT_SYMBOL_GPL(relayfs_create_dir); EXPORT_SYMBOL_GPL(relayfs_remove_dir); +EXPORT_SYMBOL_GPL(relayfs_create_file); MODULE_AUTHOR("Tom Zanussi and Karim Yaghmour "); MODULE_DESCRIPTION("Relay Filesystem"); diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c index 7fbda177ad..a9cd5585c4 100644 --- a/fs/relayfs/relay.c +++ b/fs/relayfs/relay.c @@ -176,7 +176,8 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, return NULL; /* Create file in fs */ - dentry = relayfs_create_file(filename, parent, S_IRUSR, buf); + dentry = relayfs_create_file(filename, parent, S_IRUSR, + &relayfs_file_operations, 
buf); if (!dentry) { relay_destroy_buf(buf); return NULL; diff --git a/fs/relayfs/relay.h b/fs/relayfs/relay.h index c325bb2435..0993d3e575 100644 --- a/fs/relayfs/relay.h +++ b/fs/relayfs/relay.h @@ -1,10 +1,6 @@ #ifndef _RELAY_H #define _RELAY_H -struct dentry *relayfs_create_file(const char *name, - struct dentry *parent, - int mode, - void *data); extern int relayfs_remove(struct dentry *dentry); extern int relay_buf_empty(struct rchan_buf *buf); extern void relay_destroy_channel(struct kref *kref); diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h index fb7e807373..a122df2d98 100644 --- a/include/linux/relayfs_fs.h +++ b/include/linux/relayfs_fs.h @@ -70,7 +70,7 @@ struct rchan struct relayfs_inode_info { struct inode vfs_inode; - struct rchan_buf *buf; + void *data; }; static inline struct relayfs_inode_info *RELAYFS_I(struct inode *inode) @@ -148,6 +148,11 @@ extern size_t relay_switch_subbuf(struct rchan_buf *buf, extern struct dentry *relayfs_create_dir(const char *name, struct dentry *parent); extern int relayfs_remove_dir(struct dentry *dentry); +extern struct dentry *relayfs_create_file(const char *name, + struct dentry *parent, + int mode, + struct file_operations *fops, + void *data); /** * relay_write - write data into the channel -- cgit v1.2.2 From 7431733791feb0b19453d8047b0723c744667040 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:25 -0800 Subject: [PATCH] relayfs: add relayfs_remove_file() This patch adds and exports relayfs_remove_file(), for API symmetry (with relayfs_create_file()). Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/inode.c | 12 ++++++++++++ include/linux/relayfs_fs.h | 1 + 2 files changed, 13 insertions(+) diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c index a5e6d4f2ef..b2f5065573 100644 --- a/fs/relayfs/inode.c +++ b/fs/relayfs/inode.c @@ -224,6 +224,17 @@ int relayfs_remove(struct dentry *dentry) return error; } +/** + * relayfs_remove_file - remove a file from relay filesystem + * @dentry: directory dentry + * + * Returns 0 if successful, negative otherwise. + */ +int relayfs_remove_file(struct dentry *dentry) +{ + return relayfs_remove(dentry); +} + /** * relayfs_remove_dir - remove a directory in the relay filesystem * @dentry: directory dentry @@ -600,6 +611,7 @@ EXPORT_SYMBOL_GPL(relayfs_file_operations); EXPORT_SYMBOL_GPL(relayfs_create_dir); EXPORT_SYMBOL_GPL(relayfs_remove_dir); EXPORT_SYMBOL_GPL(relayfs_create_file); +EXPORT_SYMBOL_GPL(relayfs_remove_file); MODULE_AUTHOR("Tom Zanussi and Karim Yaghmour "); MODULE_DESCRIPTION("Relay Filesystem"); diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h index a122df2d98..921540b1cd 100644 --- a/include/linux/relayfs_fs.h +++ b/include/linux/relayfs_fs.h @@ -153,6 +153,7 @@ extern struct dentry *relayfs_create_file(const char *name, int mode, struct file_operations *fops, void *data); +extern int relayfs_remove_file(struct dentry *dentry); /** * relay_write - write data into the channel -- cgit v1.2.2 From 51008f9f95a4c3158151a75f88fb03fb0f646aba Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:26 -0800 Subject: [PATCH] relayfs: use generic_ip for private data Use inode->u.generic_ip instead of relayfs_inode_info to store pointer to user data. 
Clients using relayfs_file_create() to create their own files would probably more expect their data to be stored in generic_ip; we also intend in the next set of patches to get rid of relayfs-specific stuff in the file operations, so we might as well do it here. Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/inode.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c index b2f5065573..7f6d2c8e91 100644 --- a/fs/relayfs/inode.c +++ b/fs/relayfs/inode.c @@ -54,7 +54,8 @@ static struct inode *relayfs_get_inode(struct super_block *sb, switch (mode & S_IFMT) { case S_IFREG: inode->i_fop = fops; - RELAYFS_I(inode)->data = data; + if (data) + inode->u.generic_ip = data; break; case S_IFDIR: inode->i_op = &simple_dir_inode_operations; @@ -255,8 +256,9 @@ int relayfs_remove_dir(struct dentry *dentry) */ static int relayfs_open(struct inode *inode, struct file *filp) { - struct rchan_buf *buf = RELAYFS_I(inode)->data; + struct rchan_buf *buf = inode->u.generic_ip; kref_get(&buf->kref); + filp->private_data = buf; return 0; } @@ -270,8 +272,8 @@ static int relayfs_open(struct inode *inode, struct file *filp) */ static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma) { - struct inode *inode = filp->f_dentry->d_inode; - return relay_mmap_buf(RELAYFS_I(inode)->data, vma); + struct rchan_buf *buf = filp->private_data; + return relay_mmap_buf(buf, vma); } /** @@ -284,8 +286,7 @@ static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma) static unsigned int relayfs_poll(struct file *filp, poll_table *wait) { unsigned int mask = 0; - struct inode *inode = filp->f_dentry->d_inode; - struct rchan_buf *buf = RELAYFS_I(inode)->data; + struct rchan_buf *buf = filp->private_data; if (buf->finalized) return POLLERR; @@ -309,7 +310,7 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait) */ static int relayfs_release(struct inode *inode, struct file *filp) { - struct rchan_buf *buf = RELAYFS_I(inode)->data; + struct rchan_buf *buf = filp->private_data; kref_put(&buf->kref, relay_remove_buf); return 0; @@ -470,8 +471,8 @@ static ssize_t relayfs_read(struct file *filp, size_t count, loff_t *ppos) { + struct rchan_buf *buf = filp->private_data; struct inode *inode = filp->f_dentry->d_inode; - struct rchan_buf *buf = RELAYFS_I(inode)->data; size_t read_start, avail; ssize_t ret = 0; void *from; -- cgit v1.2.2 From aaea25d7a68a7f72e167dc1698b66a15edc71883 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:26 -0800 Subject: [PATCH] relayfs: remove unused alloc/destroy_inode() Since we're no longer using relayfs_inode_info, remove relayfs_alloc_inode() and relayfs_destroy_inode() along with the relayfs inode cache. 
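The inode cache can go because the preceding patch moved per-file state out of the relayfs-specific inode: open() copies inode->u.generic_ip into filp->private_data, and every later file operation reads it from there. A sketch of that handoff, with my_data and the operation names invented for illustration:

    #include <linux/fs.h>

    struct my_data {
            int example_field;              /* whatever the creator attached to the file */
    };

    static int my_open(struct inode *inode, struct file *filp)
    {
            /* The pointer handed to the file-creation call ends up here. */
            filp->private_data = inode->u.generic_ip;
            return 0;
    }

    static ssize_t my_read(struct file *filp, char __user *buf,
                           size_t count, loff_t *ppos)
    {
            /* No filesystem-specific inode container is needed any more. */
            struct my_data *data = filp->private_data;

            (void)data;                     /* real code would format state into 'buf' */
            return 0;
    }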
Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/inode.c | 46 +--------------------------------------------- include/linux/relayfs_fs.h | 14 -------------- 2 files changed, 1 insertion(+), 59 deletions(-) diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c index 7f6d2c8e91..b4c3e0466e 100644 --- a/fs/relayfs/inode.c +++ b/fs/relayfs/inode.c @@ -26,7 +26,6 @@ static struct vfsmount * relayfs_mount; static int relayfs_mount_count; -static kmem_cache_t * relayfs_inode_cachep; static struct backing_dev_info relayfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ @@ -499,34 +498,6 @@ out: return ret; } -/** - * relayfs alloc_inode() implementation - */ -static struct inode *relayfs_alloc_inode(struct super_block *sb) -{ - struct relayfs_inode_info *p = kmem_cache_alloc(relayfs_inode_cachep, SLAB_KERNEL); - if (!p) - return NULL; - p->data = NULL; - - return &p->vfs_inode; -} - -/** - * relayfs destroy_inode() implementation - */ -static void relayfs_destroy_inode(struct inode *inode) -{ - kmem_cache_free(relayfs_inode_cachep, RELAYFS_I(inode)); -} - -static void init_once(void *p, kmem_cache_t *cachep, unsigned long flags) -{ - struct relayfs_inode_info *i = p; - if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) - inode_init_once(&i->vfs_inode); -} - struct file_operations relayfs_file_operations = { .open = relayfs_open, .poll = relayfs_poll, @@ -539,8 +510,6 @@ struct file_operations relayfs_file_operations = { static struct super_operations relayfs_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, - .alloc_inode = relayfs_alloc_inode, - .destroy_inode = relayfs_destroy_inode, }; static int relayfs_fill_super(struct super_block * sb, void * data, int silent) @@ -584,25 +553,12 @@ static struct file_system_type relayfs_fs_type = { static int __init init_relayfs_fs(void) { - int err; - - relayfs_inode_cachep = kmem_cache_create("relayfs_inode_cache", - sizeof(struct relayfs_inode_info), 0, - 0, init_once, NULL); - if (!relayfs_inode_cachep) - return -ENOMEM; - - err = register_filesystem(&relayfs_fs_type); - if (err) - kmem_cache_destroy(relayfs_inode_cachep); - - return err; + return register_filesystem(&relayfs_fs_type); } static void __exit exit_relayfs_fs(void) { unregister_filesystem(&relayfs_fs_type); - kmem_cache_destroy(relayfs_inode_cachep); } module_init(init_relayfs_fs) diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h index 921540b1cd..8200ecbe6e 100644 --- a/include/linux/relayfs_fs.h +++ b/include/linux/relayfs_fs.h @@ -64,20 +64,6 @@ struct rchan struct rchan_buf *buf[NR_CPUS]; /* per-cpu channel buffers */ }; -/* - * Relayfs inode - */ -struct relayfs_inode_info -{ - struct inode vfs_inode; - void *data; -}; - -static inline struct relayfs_inode_info *RELAYFS_I(struct inode *inode) -{ - return container_of(inode, struct relayfs_inode_info, vfs_inode); -} - /* * Relay channel client callbacks */ -- cgit v1.2.2 From 925ac8a2b637466ba0ad8dfaf7b49aa9a362502f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:27 -0800 Subject: [PATCH] relayfs: add Documention for non-relay files Documentation update for non-relay files. 
Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/relayfs.txt | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/Documentation/filesystems/relayfs.txt b/Documentation/filesystems/relayfs.txt index d803abed29..0720a049d0 100644 --- a/Documentation/filesystems/relayfs.txt +++ b/Documentation/filesystems/relayfs.txt @@ -125,6 +125,8 @@ Here's a summary of the API relayfs provides to in-kernel clients: relay_reset(chan) relayfs_create_dir(name, parent) relayfs_remove_dir(dentry) + relayfs_create_file(name, parent, mode, fops, data) + relayfs_remove_file(dentry) channel management typically called on instigation of userspace: @@ -320,6 +322,27 @@ forces a sub-buffer switch on all the channel buffers, and can be used to finalize and process the last sub-buffers before the channel is closed. +Creating non-relay files +------------------------ + +relay_open() automatically creates files in the relayfs filesystem to +represent the per-cpu kernel buffers; it's often useful for +applications to be able to create their own files alongside the relay +files in the relayfs filesystem as well e.g. 'control' files much like +those created in /proc or debugfs for similar purposes, used to +communicate control information between the kernel and user sides of a +relayfs application. For this purpose the relayfs_create_file() and +relayfs_remove_file() API functions exist. For relayfs_create_file(), +the caller passes in a set of user-defined file operations to be used +for the file and an optional void * to a user-specified data item, +which will be accessible via inode->u.generic_ip (see the relay-apps +tarball for examples). The file_operations are a required parameter +to relayfs_create_file() and thus the semantics of these files are +completely defined by the caller. + +See the relay-apps tarball at http://relayfs.sourceforge.net for +examples of how these non-relay files are meant to be used. + Misc ---- -- cgit v1.2.2 From 08c541a7ade230883c48225f4ea406a0117e7c2f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:28 -0800 Subject: [PATCH] relayfs: add support for relay files in other filesystems This patch adds a couple of callback functions that allow a client to hook into relay_open()/close() and supply the files that will be used to represent the channel buffers; the default implementation if no callbacks are defined is to create the files in relayfs. This is to support the creation and use of relay files in other filesystems such as debugfs, as implied by the fact that relayfs_file_operations are exported. 
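A client that wants its channel buffers in another pseudo-filesystem supplies both callbacks in its rchan_callbacks. The sketch below targets debugfs and is an illustration under stated assumptions, not code from the patch: it presumes debugfs is available and that debugfs_create_file() and debugfs_remove() have the (name, mode, parent, data, fops) and (dentry) signatures of this kernel generation, and it uses the callback signatures as added here (a later patch in the series adds an is_global outparam to create_buf_file()).

    #include <linux/debugfs.h>
    #include <linux/relayfs_fs.h>

    static struct dentry *my_create_buf_file(const char *filename,
                                             struct dentry *parent,
                                             int mode,
                                             struct rchan_buf *buf)
    {
            /* Represent the relay buffer with a debugfs file; the exported
             * relayfs_file_operations still drive reads and mmap on it.
             */
            return debugfs_create_file(filename, mode, parent, buf,
                                       &relayfs_file_operations);
    }

    static int my_remove_buf_file(struct dentry *dentry)
    {
            debugfs_remove(dentry);
            return 0;
    }

    static struct rchan_callbacks my_relay_callbacks = {
            /* Members left unset fall back to the relay defaults. */
            .create_buf_file = my_create_buf_file,
            .remove_buf_file = my_remove_buf_file,
    };

This is exactly the use case the exported relay file operations are meant for; note that the parent dentry passed to relay_open() must then live in the same filesystem as the files these callbacks create.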
Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/buffers.c | 2 +- fs/relayfs/relay.c | 30 ++++++++++++++++++++++++++++-- include/linux/relayfs_fs.h | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 3 deletions(-) diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c index 667b529944..1018781277 100644 --- a/fs/relayfs/buffers.c +++ b/fs/relayfs/buffers.c @@ -185,6 +185,6 @@ void relay_destroy_buf(struct rchan_buf *buf) void relay_remove_buf(struct kref *kref) { struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); - relayfs_remove(buf->dentry); + buf->chan->cb->remove_buf_file(buf->dentry); relay_destroy_buf(buf); } diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c index a9cd5585c4..b9bb569032 100644 --- a/fs/relayfs/relay.c +++ b/fs/relayfs/relay.c @@ -80,11 +80,33 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf, { } +/* + * create_buf_file_create() default callback. Creates file to represent buf. + */ +static struct dentry *create_buf_file_default_callback(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf) +{ + return relayfs_create_file(filename, parent, mode, + &relayfs_file_operations, buf); +} + +/* + * remove_buf_file() default callback. Removes file representing relay buffer. + */ +static int remove_buf_file_default_callback(struct dentry *dentry) +{ + return relayfs_remove(dentry); +} + /* relay channel default callbacks */ static struct rchan_callbacks default_channel_callbacks = { .subbuf_start = subbuf_start_default_callback, .buf_mapped = buf_mapped_default_callback, .buf_unmapped = buf_unmapped_default_callback, + .create_buf_file = create_buf_file_default_callback, + .remove_buf_file = remove_buf_file_default_callback, }; /** @@ -176,8 +198,8 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, return NULL; /* Create file in fs */ - dentry = relayfs_create_file(filename, parent, S_IRUSR, - &relayfs_file_operations, buf); + dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR, + buf); if (!dentry) { relay_destroy_buf(buf); return NULL; @@ -220,6 +242,10 @@ static inline void setup_callbacks(struct rchan *chan, cb->buf_mapped = buf_mapped_default_callback; if (!cb->buf_unmapped) cb->buf_unmapped = buf_unmapped_default_callback; + if (!cb->create_buf_file) + cb->create_buf_file = create_buf_file_default_callback; + if (!cb->remove_buf_file) + cb->remove_buf_file = remove_buf_file_default_callback; chan->cb = cb; } diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h index 8200ecbe6e..8c21771058 100644 --- a/include/linux/relayfs_fs.h +++ b/include/linux/relayfs_fs.h @@ -110,6 +110,40 @@ struct rchan_callbacks */ void (*buf_unmapped)(struct rchan_buf *buf, struct file *filp); + /* + * create_buf_file - create file to represent a relayfs channel buffer + * @filename: the name of the file to create + * @parent: the parent of the file to create + * @mode: the mode of the file to create + * @buf: the channel buffer + * + * Called during relay_open(), once for each per-cpu buffer, + * to allow the client to create a file to be used to + * represent the corresponding channel buffer. If the file is + * created outside of relayfs, the parent must also exist in + * that filesystem. + * + * The callback should return the dentry of the file created + * to represent the relay buffer. + * + * See Documentation/filesystems/relayfs.txt for more info. 
+ */ + struct dentry *(*create_buf_file)(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf); + + /* + * remove_buf_file - remove file representing a relayfs channel buffer + * @dentry: the dentry of the file to remove + * + * Called during relay_close(), once for each per-cpu buffer, + * to allow the client to remove a file used to represent a + * channel buffer. + * + * The callback should return 0 if successful, negative if not. + */ + int (*remove_buf_file)(struct dentry *dentry); }; /* -- cgit v1.2.2 From 03d78d11d92b5ed688bb18167b05d9d01493e175 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:29 -0800 Subject: [PATCH] relayfs: add Documentation on relay files in other filesystems Documentation update for creating relay files in other filesystems. Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/relayfs.txt | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Documentation/filesystems/relayfs.txt b/Documentation/filesystems/relayfs.txt index 0720a049d0..4221b3a52e 100644 --- a/Documentation/filesystems/relayfs.txt +++ b/Documentation/filesystems/relayfs.txt @@ -143,6 +143,8 @@ Here's a summary of the API relayfs provides to in-kernel clients: subbuf_start(buf, subbuf, prev_subbuf, prev_padding) buf_mapped(buf, filp) buf_unmapped(buf, filp) + create_buf_file(filename, parent, mode, buf) + remove_buf_file(dentry) helper functions: @@ -343,6 +345,31 @@ completely defined by the caller. See the relay-apps tarball at http://relayfs.sourceforge.net for examples of how these non-relay files are meant to be used. +Creating relay files in other filesystems +----------------------------------------- + +By default of course, relay_open() creates relay files in the relayfs +filesystem. Because relay_file_operations is exported, however, it's +also possible to create and use relay files in other pseudo-filesytems +such as debugfs. + +For this purpose, two callback functions are provided, +create_buf_file() and remove_buf_file(). create_buf_file() is called +once for each per-cpu buffer from relay_open() to allow the client to +create a file to be used to represent the corresponding buffer; if +this callback is not defined, the default implementation will create +and return a file in the relayfs filesystem to represent the buffer. +The callback should return the dentry of the file created to represent +the relay buffer. Note that the parent directory passed to +relay_open() (and passed along to the callback), if specified, must +exist in the same filesystem the new relay file is created in. If +create_buf_file() is defined, remove_buf_file() must also be defined; +it's responsible for deleting the file(s) created in create_buf_file() +and is called during relay_close(). + +See the 'exported-relayfile' examples in the relay-apps tarball for +examples of creating and using relay files in debugfs. + Misc ---- -- cgit v1.2.2 From e6c08367b8fc6dce6dfd1106f53f6ef28215b313 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:29 -0800 Subject: [PATCH] relayfs: add support for global relay buffers This patch adds the optional is_global outparam to the create_buf_file() callback. This can be used by clients to create a single global relayfs buffer instead of the default per-cpu buffers. 
This was suggested as being useful for certain debugging applications where it's more convenient to be able to get all the data from a single channel without having to go to the bother of dealing with per-cpu files. Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/relay.c | 35 +++++++++++++++++++++++++---------- include/linux/relayfs_fs.h | 8 +++++++- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c index b9bb569032..2935a6ab8f 100644 --- a/fs/relayfs/relay.c +++ b/fs/relayfs/relay.c @@ -86,7 +86,8 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf, static struct dentry *create_buf_file_default_callback(const char *filename, struct dentry *parent, int mode, - struct rchan_buf *buf) + struct rchan_buf *buf, + int *is_global) { return relayfs_create_file(filename, parent, mode, &relayfs_file_operations, buf); @@ -170,14 +171,16 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init) void relay_reset(struct rchan *chan) { unsigned int i; + struct rchan_buf *prev = NULL; if (!chan) return; for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i]) - continue; + if (!chan->buf[i] || chan->buf[i] == prev) + break; __relay_reset(chan->buf[i], 0); + prev = chan->buf[i]; } } @@ -188,18 +191,22 @@ void relay_reset(struct rchan *chan) */ static struct rchan_buf *relay_open_buf(struct rchan *chan, const char *filename, - struct dentry *parent) + struct dentry *parent, + int *is_global) { struct rchan_buf *buf; struct dentry *dentry; + if (*is_global) + return chan->buf[0]; + buf = relay_create_buf(chan); if (!buf) return NULL; /* Create file in fs */ dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR, - buf); + buf, is_global); if (!dentry) { relay_destroy_buf(buf); return NULL; @@ -273,6 +280,7 @@ struct rchan *relay_open(const char *base_filename, unsigned int i; struct rchan *chan; char *tmpname; + int is_global = 0; if (!base_filename) return NULL; @@ -297,7 +305,8 @@ struct rchan *relay_open(const char *base_filename, for_each_online_cpu(i) { sprintf(tmpname, "%s%d", base_filename, i); - chan->buf[i] = relay_open_buf(chan, tmpname, parent); + chan->buf[i] = relay_open_buf(chan, tmpname, parent, + &is_global); chan->buf[i]->cpu = i; if (!chan->buf[i]) goto free_bufs; @@ -311,6 +320,8 @@ free_bufs: if (!chan->buf[i]) break; relay_close_buf(chan->buf[i]); + if (is_global) + break; } kfree(tmpname); @@ -420,14 +431,16 @@ void relay_destroy_channel(struct kref *kref) void relay_close(struct rchan *chan) { unsigned int i; + struct rchan_buf *prev = NULL; if (!chan) return; for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i]) - continue; + if (!chan->buf[i] || chan->buf[i] == prev) + break; relay_close_buf(chan->buf[i]); + prev = chan->buf[i]; } if (chan->last_toobig) @@ -447,14 +460,16 @@ void relay_close(struct rchan *chan) void relay_flush(struct rchan *chan) { unsigned int i; + struct rchan_buf *prev = NULL; if (!chan) return; for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i]) - continue; + if (!chan->buf[i] || chan->buf[i] == prev) + break; relay_switch_subbuf(chan->buf[i], 0); + prev = chan->buf[i]; } } diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h index 8c21771058..30f45511b4 100644 --- a/include/linux/relayfs_fs.h +++ b/include/linux/relayfs_fs.h @@ -116,6 +116,7 @@ struct rchan_callbacks * @parent: the parent of the file to create * @mode: the mode of the file to create * @buf: the channel buffer + * @is_global: outparam - 
set non-zero if the buffer should be global * * Called during relay_open(), once for each per-cpu buffer, * to allow the client to create a file to be used to @@ -126,12 +127,17 @@ struct rchan_callbacks * The callback should return the dentry of the file created * to represent the relay buffer. * + * Setting the is_global outparam to a non-zero value will + * cause relay_open() to create a single global buffer rather + * than the default set of per-cpu buffers. + * * See Documentation/filesystems/relayfs.txt for more info. */ struct dentry *(*create_buf_file)(const char *filename, struct dentry *parent, int mode, - struct rchan_buf *buf); + struct rchan_buf *buf, + int *is_global); /* * remove_buf_file - remove file representing a relayfs channel buffer -- cgit v1.2.2 From df49af8f3392e96ddbc9cf6dd81fbdc0496e1b44 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:30 -0800 Subject: [PATCH] relayfs: add Documentation on global relay buffers Documentation update for creating global buffers. Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/relayfs.txt | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/relayfs.txt b/Documentation/filesystems/relayfs.txt index 4221b3a52e..d9693cb860 100644 --- a/Documentation/filesystems/relayfs.txt +++ b/Documentation/filesystems/relayfs.txt @@ -143,7 +143,7 @@ Here's a summary of the API relayfs provides to in-kernel clients: subbuf_start(buf, subbuf, prev_subbuf, prev_padding) buf_mapped(buf, filp) buf_unmapped(buf, filp) - create_buf_file(filename, parent, mode, buf) + create_buf_file(filename, parent, mode, buf, is_global) remove_buf_file(dentry) helper functions: @@ -367,6 +367,25 @@ create_buf_file() is defined, remove_buf_file() must also be defined; it's responsible for deleting the file(s) created in create_buf_file() and is called during relay_close(). +The create_buf_file() implementation can also be defined in such a way +as to allow the creation of a single 'global' buffer instead of the +default per-cpu set. This can be useful for applications interested +mainly in seeing the relative ordering of system-wide events without +the need to bother with saving explicit timestamps for the purpose of +merging/sorting per-cpu files in a postprocessing step. + +To have relay_open() create a global buffer, the create_buf_file() +implementation should set the value of the is_global outparam to a +non-zero value in addition to creating the file that will be used to +represent the single buffer. In the case of a global buffer, +create_buf_file() and remove_buf_file() will be called only once. The +normal channel-writing functions e.g. relay_write() can still be used +- writes from any cpu will transparently end up in the global buffer - +but since it is a global buffer, callers should make sure they use the +proper locking for such a buffer, either by wrapping writes in a +spinlock, or by copying a write function from relayfs_fs.h and +creating a local version that internally does the proper locking. + See the 'exported-relayfile' examples in the relay-apps tarball for examples of creating and using relay files in debugfs. 
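As a rough sketch of what the documentation above describes (not part of these patches; all names are placeholders), a client that wants its buffer files in debugfs and a single global buffer might supply callbacks along these lines. Note that the exported file_operations symbol is spelled relayfs_file_operations until the rename patch later in this series makes it relay_file_operations:

    #include <linux/debugfs.h>
    #include <linux/relayfs_fs.h>

    /*
     * Placeholder client callbacks: create the buffer file in debugfs and
     * ask relay_open() for one global buffer instead of per-cpu buffers.
     */
    static struct dentry *example_create_buf_file(const char *filename,
                                                  struct dentry *parent,
                                                  int mode,
                                                  struct rchan_buf *buf,
                                                  int *is_global)
    {
            *is_global = 1;  /* called only once; all cpus share this buffer */
            return debugfs_create_file(filename, mode, parent, buf,
                                       &relay_file_operations);
    }

    static int example_remove_buf_file(struct dentry *dentry)
    {
            debugfs_remove(dentry);  /* called once from relay_close() */
            return 0;
    }

    /* passed as the callbacks argument of relay_open() */
    static struct rchan_callbacks example_relay_callbacks = {
            .create_buf_file = example_create_buf_file,
            .remove_buf_file = example_remove_buf_file,
    };

With callbacks like these, writes from any cpu end up in the single debugfs-backed buffer, subject to the locking caveat described above.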
-- cgit v1.2.2 From 761da5c88aca34586e5b7295bd8b9be2438906f2 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:31 -0800 Subject: [PATCH] relayfs: cleanup, change relayfs_file_* to relay_file_* This patch renames relayfs_file_operations to relay_file_operations, and the file operations themselves from relayfs_XXX to relay_file_XXX, to make it more clear that they refer to relay files. Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/inode.c | 89 ++++++++++++++++++++++++---------------------- fs/relayfs/relay.c | 2 +- include/linux/relayfs_fs.h | 5 ++- 3 files changed, 50 insertions(+), 46 deletions(-) diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c index b4c3e0466e..7b7f2cb5f0 100644 --- a/fs/relayfs/inode.c +++ b/fs/relayfs/inode.c @@ -247,13 +247,13 @@ int relayfs_remove_dir(struct dentry *dentry) } /** - * relayfs_open - open file op for relayfs files + * relay_file_open - open file op for relay files * @inode: the inode * @filp: the file * * Increments the channel buffer refcount. */ -static int relayfs_open(struct inode *inode, struct file *filp) +static int relay_file_open(struct inode *inode, struct file *filp) { struct rchan_buf *buf = inode->u.generic_ip; kref_get(&buf->kref); @@ -263,26 +263,26 @@ static int relayfs_open(struct inode *inode, struct file *filp) } /** - * relayfs_mmap - mmap file op for relayfs files + * relay_file_mmap - mmap file op for relay files * @filp: the file * @vma: the vma describing what to map * * Calls upon relay_mmap_buf to map the file into user space. */ -static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma) +static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) { struct rchan_buf *buf = filp->private_data; return relay_mmap_buf(buf, vma); } /** - * relayfs_poll - poll file op for relayfs files + * relay_file_poll - poll file op for relay files * @filp: the file * @wait: poll table * * Poll implemention. */ -static unsigned int relayfs_poll(struct file *filp, poll_table *wait) +static unsigned int relay_file_poll(struct file *filp, poll_table *wait) { unsigned int mask = 0; struct rchan_buf *buf = filp->private_data; @@ -300,14 +300,14 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait) } /** - * relayfs_release - release file op for relayfs files + * relay_file_release - release file op for relay files * @inode: the inode * @filp: the file * * Decrements the channel refcount, as the filesystem is * no longer using it. */ -static int relayfs_release(struct inode *inode, struct file *filp) +static int relay_file_release(struct inode *inode, struct file *filp) { struct rchan_buf *buf = filp->private_data; kref_put(&buf->kref, relay_remove_buf); @@ -316,11 +316,11 @@ static int relayfs_release(struct inode *inode, struct file *filp) } /** - * relayfs_read_consume - update the consumed count for the buffer + * relay_file_read_consume - update the consumed count for the buffer */ -static void relayfs_read_consume(struct rchan_buf *buf, - size_t read_pos, - size_t bytes_consumed) +static void relay_file_read_consume(struct rchan_buf *buf, + size_t read_pos, + size_t bytes_consumed) { size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; @@ -343,9 +343,9 @@ static void relayfs_read_consume(struct rchan_buf *buf, } /** - * relayfs_read_avail - boolean, are there unconsumed bytes available? + * relay_file_read_avail - boolean, are there unconsumed bytes available? 
*/ -static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos) +static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) { size_t bytes_produced, bytes_consumed, write_offset; size_t subbuf_size = buf->chan->subbuf_size; @@ -376,16 +376,16 @@ static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos) if (bytes_produced == bytes_consumed) return 0; - relayfs_read_consume(buf, read_pos, 0); + relay_file_read_consume(buf, read_pos, 0); return 1; } /** - * relayfs_read_subbuf_avail - return bytes available in sub-buffer + * relay_file_read_subbuf_avail - return bytes available in sub-buffer */ -static size_t relayfs_read_subbuf_avail(size_t read_pos, - struct rchan_buf *buf) +static size_t relay_file_read_subbuf_avail(size_t read_pos, + struct rchan_buf *buf) { size_t padding, avail = 0; size_t read_subbuf, read_offset, write_subbuf, write_offset; @@ -407,14 +407,14 @@ static size_t relayfs_read_subbuf_avail(size_t read_pos, } /** - * relayfs_read_start_pos - find the first available byte to read + * relay_file_read_start_pos - find the first available byte to read * * If the read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise * return the original value. */ -static size_t relayfs_read_start_pos(size_t read_pos, - struct rchan_buf *buf) +static size_t relay_file_read_start_pos(size_t read_pos, + struct rchan_buf *buf) { size_t read_subbuf, padding, padding_start, padding_end; size_t subbuf_size = buf->chan->subbuf_size; @@ -433,11 +433,11 @@ static size_t relayfs_read_start_pos(size_t read_pos, } /** - * relayfs_read_end_pos - return the new read position + * relay_file_read_end_pos - return the new read position */ -static size_t relayfs_read_end_pos(struct rchan_buf *buf, - size_t read_pos, - size_t count) +static size_t relay_file_read_end_pos(struct rchan_buf *buf, + size_t read_pos, + size_t count) { size_t read_subbuf, padding, end_pos; size_t subbuf_size = buf->chan->subbuf_size; @@ -456,7 +456,7 @@ static size_t relayfs_read_end_pos(struct rchan_buf *buf, } /** - * relayfs_read - read file op for relayfs files + * relay_file_read - read file op for relay files * @filp: the file * @buffer: the userspace buffer * @count: number of bytes to read @@ -465,10 +465,10 @@ static size_t relayfs_read_end_pos(struct rchan_buf *buf, * Reads count bytes or the number of bytes available in the * current sub-buffer being read, whichever is smaller. 
*/ -static ssize_t relayfs_read(struct file *filp, - char __user *buffer, - size_t count, - loff_t *ppos) +static ssize_t relay_file_read(struct file *filp, + char __user *buffer, + size_t count, + loff_t *ppos) { struct rchan_buf *buf = filp->private_data; struct inode *inode = filp->f_dentry->d_inode; @@ -477,11 +477,11 @@ static ssize_t relayfs_read(struct file *filp, void *from; down(&inode->i_sem); - if(!relayfs_read_avail(buf, *ppos)) + if(!relay_file_read_avail(buf, *ppos)) goto out; - read_start = relayfs_read_start_pos(*ppos, buf); - avail = relayfs_read_subbuf_avail(read_start, buf); + read_start = relay_file_read_start_pos(*ppos, buf); + avail = relay_file_read_subbuf_avail(read_start, buf); if (!avail) goto out; @@ -491,20 +491,20 @@ static ssize_t relayfs_read(struct file *filp, ret = -EFAULT; goto out; } - relayfs_read_consume(buf, read_start, count); - *ppos = relayfs_read_end_pos(buf, read_start, count); + relay_file_read_consume(buf, read_start, count); + *ppos = relay_file_read_end_pos(buf, read_start, count); out: up(&inode->i_sem); return ret; } -struct file_operations relayfs_file_operations = { - .open = relayfs_open, - .poll = relayfs_poll, - .mmap = relayfs_mmap, - .read = relayfs_read, +struct file_operations relay_file_operations = { + .open = relay_file_open, + .poll = relay_file_poll, + .mmap = relay_file_mmap, + .read = relay_file_read, .llseek = no_llseek, - .release = relayfs_release, + .release = relay_file_release, }; static struct super_operations relayfs_ops = { @@ -558,13 +558,18 @@ static int __init init_relayfs_fs(void) static void __exit exit_relayfs_fs(void) { + + + + + unregister_filesystem(&relayfs_fs_type); } module_init(init_relayfs_fs) module_exit(exit_relayfs_fs) -EXPORT_SYMBOL_GPL(relayfs_file_operations); +EXPORT_SYMBOL_GPL(relay_file_operations); EXPORT_SYMBOL_GPL(relayfs_create_dir); EXPORT_SYMBOL_GPL(relayfs_remove_dir); EXPORT_SYMBOL_GPL(relayfs_create_file); diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c index 2935a6ab8f..abf3ceaace 100644 --- a/fs/relayfs/relay.c +++ b/fs/relayfs/relay.c @@ -90,7 +90,7 @@ static struct dentry *create_buf_file_default_callback(const char *filename, int *is_global) { return relayfs_create_file(filename, parent, mode, - &relayfs_file_operations, buf); + &relay_file_operations, buf); } /* diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h index 30f45511b4..7342e66247 100644 --- a/include/linux/relayfs_fs.h +++ b/include/linux/relayfs_fs.h @@ -279,10 +279,9 @@ static inline void subbuf_start_reserve(struct rchan_buf *buf, } /* - * exported relayfs file operations, fs/relayfs/inode.c + * exported relay file operations, fs/relayfs/inode.c */ - -extern struct file_operations relayfs_file_operations; +extern struct file_operations relay_file_operations; #endif /* _LINUX_RELAYFS_FS_H */ -- cgit v1.2.2 From 6b34350f490b2c8508717541ec1fd2bbaadded94 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:32 -0800 Subject: [PATCH] relayfs: Documentation cleanup, remove obsolete info librelay and relay-app.h have been retired - update Documentation to reflect that. 
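The updated text below points clients at using relayfs from scratch; a minimal kernel-side sketch of that (illustrative names only; it assumes a NULL callbacks argument falls back to the defaults shown earlier, and it omits the userspace signalling the text talks about) might look like:

    #include <linux/module.h>
    #include <linux/relayfs_fs.h>

    static struct rchan *example_chan;

    static int __init example_init(void)
    {
            /* eight 64KB sub-buffers per cpu, files in the relayfs root */
            example_chan = relay_open("example-cpu", NULL, 65536, 8, NULL);
            if (!example_chan)
                    return -ENOMEM;

            relay_write(example_chan, "hello", 5);  /* logged on the current cpu */
            return 0;
    }

    static void __exit example_exit(void)
    {
            relay_close(example_chan);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");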
Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/relayfs.txt | 57 +++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/Documentation/filesystems/relayfs.txt b/Documentation/filesystems/relayfs.txt index d9693cb860..5832377b73 100644 --- a/Documentation/filesystems/relayfs.txt +++ b/Documentation/filesystems/relayfs.txt @@ -44,30 +44,41 @@ relayfs can operate in a mode where it will overwrite data not yet collected by userspace, and not wait for it to consume it. relayfs itself does not provide for communication of such data between -userspace and kernel, allowing the kernel side to remain simple and not -impose a single interface on userspace. It does provide a separate -helper though, described below. +userspace and kernel, allowing the kernel side to remain simple and +not impose a single interface on userspace. It does provide a set of +examples and a separate helper though, described below. + +klog and relay-apps example code +================================ + +relayfs itself is ready to use, but to make things easier, a couple +simple utility functions and a set of examples are provided. + +The relay-apps example tarball, available on the relayfs sourceforge +site, contains a set of self-contained examples, each consisting of a +pair of .c files containing boilerplate code for each of the user and +kernel sides of a relayfs application; combined these two sets of +boilerplate code provide glue to easily stream data to disk, without +having to bother with mundane housekeeping chores. + +The 'klog debugging functions' patch (klog.patch in the relay-apps +tarball) provides a couple of high-level logging functions to the +kernel which allow writing formatted text or raw data to a channel, +regardless of whether a channel to write into exists or not, or +whether relayfs is compiled into the kernel or is configured as a +module. These functions allow you to put unconditional 'trace' +statements anywhere in the kernel or kernel modules; only when there +is a 'klog handler' registered will data actually be logged (see the +klog and kleak examples for details). + +It is of course possible to use relayfs from scratch i.e. without +using any of the relay-apps example code or klog, but you'll have to +implement communication between userspace and kernel, allowing both to +convey the state of buffers (full, empty, amount of padding). + +klog and the relay-apps examples can be found in the relay-apps +tarball on http://relayfs.sourceforge.net -klog, relay-app & librelay -========================== - -relayfs itself is ready to use, but to make things easier, two -additional systems are provided. klog is a simple wrapper to make -writing formatted text or raw data to a channel simpler, regardless of -whether a channel to write into exists or not, or whether relayfs is -compiled into the kernel or is configured as a module. relay-app is -the kernel counterpart of userspace librelay.c, combined these two -files provide glue to easily stream data to disk, without having to -bother with housekeeping. klog and relay-app can be used together, -with klog providing high-level logging functions to the kernel and -relay-app taking care of kernel-user control and disk-logging chores. - -It is possible to use relayfs without relay-app & librelay, but you'll -have to implement communication between userspace and kernel, allowing -both to convey the state of buffers (full, empty, amount of padding). 
- -klog, relay-app and librelay can be found in the relay-apps tarball on -http://relayfs.sourceforge.net The relayfs user space API ========================== -- cgit v1.2.2 From 6b9c7ed84837753a436415097063232422e29a35 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 8 Jan 2006 01:02:33 -0800 Subject: [PATCH] use ptrace_get_task_struct in various places The ptrace_get_task_struct() helper that I added as part of the ptrace consolidation is useful in variety of places that currently opencode it. Switch them to the common helpers. Add a ptrace_traceme() helper that needs to be explicitly called, and simplify the ptrace_get_task_struct() interface. We don't need the request argument now, and we return the task_struct directly, using ERR_PTR() for error returns. It's a bit more code in the callers, but we have two sane routines that do one thing well now. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/ptrace.c | 24 +++---------- arch/ia64/ia32/sys_ia32.c | 16 +++------ arch/ia64/kernel/ptrace.c | 9 +---- arch/m32r/kernel/ptrace.c | 22 +++--------- arch/mips/kernel/ptrace32.c | 26 ++++---------- arch/powerpc/kernel/ptrace32.c | 28 ++++----------- arch/s390/kernel/ptrace.c | 29 ++++------------ arch/sparc/kernel/ptrace.c | 35 ++++--------------- arch/sparc64/kernel/ptrace.c | 34 +++---------------- arch/x86_64/ia32/ptrace32.c | 44 ++++++------------------ include/linux/ptrace.h | 2 ++ kernel/ptrace.c | 77 +++++++++++++++++++++++++----------------- 12 files changed, 105 insertions(+), 241 deletions(-) diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index bbd37536d1..9969d212e9 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -265,30 +265,16 @@ do_sys_ptrace(long request, long pid, long addr, long data, lock_kernel(); DBG(DBG_MEM, ("request=%ld pid=%ld addr=0x%lx data=0x%lx\n", request, pid, addr, data)); - ret = -EPERM; if (request == PTRACE_TRACEME) { - /* are we already being traced? */ - if (current->ptrace & PT_PTRACED) - goto out_notsk; - ret = security_ptrace(current->parent, current); - if (ret) - goto out_notsk; - /* set the ptrace bit in the process ptrace flags. */ - current->ptrace |= PT_PTRACED; - ret = 0; + ret = ptrace_traceme(); goto out_notsk; } - if (pid == 1) /* you may not mess with init */ - goto out_notsk; - ret = -ESRCH; - read_lock(&tasklist_lock); - child = find_task_by_pid(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); - if (!child) + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) { + ret = PTR_ERR(child); goto out_notsk; + } if (request == PTRACE_ATTACH) { ret = ptrace_attach(child); diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index dc28271042..9f8e8d5588 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -1761,21 +1761,15 @@ sys32_ptrace (int request, pid_t pid, unsigned int addr, unsigned int data) lock_kernel(); if (request == PTRACE_TRACEME) { - ret = sys_ptrace(request, pid, addr, data); + ret = ptrace_traceme(); goto out; } - ret = -ESRCH; - read_lock(&tasklist_lock); - child = find_task_by_pid(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); - if (!child) + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) { + ret = PTR_ERR(child); goto out; - ret = -EPERM; - if (pid == 1) /* no messing around with init! 
*/ - goto out_tsk; + } if (request == PTRACE_ATTACH) { ret = sys_ptrace(request, pid, addr, data); diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 4b19d04106..8d88eeea02 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1422,14 +1422,7 @@ sys_ptrace (long request, pid_t pid, unsigned long addr, unsigned long data) lock_kernel(); ret = -EPERM; if (request == PTRACE_TRACEME) { - /* are we already being traced? */ - if (current->ptrace & PT_PTRACED) - goto out; - ret = security_ptrace(current->parent, current); - if (ret) - goto out; - current->ptrace |= PT_PTRACED; - ret = 0; + ret = ptrace_traceme(); goto out; } diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c index 078d2a0e71..9b75caaf5c 100644 --- a/arch/m32r/kernel/ptrace.c +++ b/arch/m32r/kernel/ptrace.c @@ -762,28 +762,16 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) int ret; lock_kernel(); - ret = -EPERM; if (request == PTRACE_TRACEME) { - /* are we already being traced? */ - if (current->ptrace & PT_PTRACED) - goto out; - /* set the ptrace bit in the process flags. */ - current->ptrace |= PT_PTRACED; - ret = 0; + ret = ptrace_traceme(); goto out; } - ret = -ESRCH; - read_lock(&tasklist_lock); - child = find_task_by_pid(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); - if (!child) - goto out; - ret = -EPERM; - if (pid == 1) /* you may not mess with init */ + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) { + ret = PTR_ERR(child); goto out; + } if (request == PTRACE_ATTACH) { ret = ptrace_attach(child); diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index 9a9b049721..7e55457a49 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -57,30 +57,16 @@ asmlinkage int sys32_ptrace(int request, int pid, int addr, int data) (unsigned long) data); #endif lock_kernel(); - ret = -EPERM; if (request == PTRACE_TRACEME) { - /* are we already being traced? */ - if (current->ptrace & PT_PTRACED) - goto out; - if ((ret = security_ptrace(current->parent, current))) - goto out; - /* set the ptrace bit in the process flags. */ - current->ptrace |= PT_PTRACED; - ret = 0; + ret = ptrace_traceme(); goto out; } - ret = -ESRCH; - read_lock(&tasklist_lock); - child = find_task_by_pid(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); - if (!child) - goto out; - ret = -EPERM; - if (pid == 1) /* you may not mess with init */ - goto out_tsk; + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) { + ret = PTR_ERR(child); + goto out; + } if (request == PTRACE_ATTACH) { ret = ptrace_attach(child); diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c index 61762640b8..826ee3d056 100644 --- a/arch/powerpc/kernel/ptrace32.c +++ b/arch/powerpc/kernel/ptrace32.c @@ -45,33 +45,19 @@ long compat_sys_ptrace(int request, int pid, unsigned long addr, unsigned long data) { struct task_struct *child; - int ret = -EPERM; + int ret; lock_kernel(); if (request == PTRACE_TRACEME) { - /* are we already being traced? */ - if (current->ptrace & PT_PTRACED) - goto out; - ret = security_ptrace(current->parent, current); - if (ret) - goto out; - /* set the ptrace bit in the process flags. 
*/ - current->ptrace |= PT_PTRACED; - ret = 0; + ret = ptrace_traceme(); goto out; } - ret = -ESRCH; - read_lock(&tasklist_lock); - child = find_task_by_pid(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); - if (!child) - goto out; - ret = -EPERM; - if (pid == 1) /* you may not mess with init */ - goto out_tsk; + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) { + ret = PTR_ERR(child); + goto out; + } if (request == PTRACE_ATTACH) { ret = ptrace_attach(child); diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 8ecda6d66d..cc02232aa9 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -712,35 +712,18 @@ sys_ptrace(long request, long pid, long addr, long data) int ret; lock_kernel(); - if (request == PTRACE_TRACEME) { - /* are we already being traced? */ - ret = -EPERM; - if (current->ptrace & PT_PTRACED) - goto out; - ret = security_ptrace(current->parent, current); - if (ret) - goto out; - /* set the ptrace bit in the process flags. */ - current->ptrace |= PT_PTRACED; - goto out; + ret = ptrace_traceme(); + goto out; } - ret = -EPERM; - if (pid == 1) /* you may not mess with init */ - goto out; - - ret = -ESRCH; - read_lock(&tasklist_lock); - child = find_task_by_pid(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); - if (!child) + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) { + ret = PTR_ERR(child); goto out; + } ret = do_ptrace(child, request, addr, data); - put_task_struct(child); out: unlock_kernel(); diff --git a/arch/sparc/kernel/ptrace.c b/arch/sparc/kernel/ptrace.c index 475c4c1346..fc470c0e9d 100644 --- a/arch/sparc/kernel/ptrace.c +++ b/arch/sparc/kernel/ptrace.c @@ -286,40 +286,17 @@ asmlinkage void do_ptrace(struct pt_regs *regs) s, (int) request, (int) pid, addr, data, addr2); } #endif - if (request == PTRACE_TRACEME) { - int my_ret; - - /* are we already being traced? */ - if (current->ptrace & PT_PTRACED) { - pt_error_return(regs, EPERM); - goto out; - } - my_ret = security_ptrace(current->parent, current); - if (my_ret) { - pt_error_return(regs, -my_ret); - goto out; - } - /* set the ptrace bit in the process flags. */ - current->ptrace |= PT_PTRACED; + if (request == PTRACE_TRACEME) { + ret = ptrace_traceme(); pt_succ_return(regs, 0); goto out; } -#ifndef ALLOW_INIT_TRACING - if (pid == 1) { - /* Can't dork with init. */ - pt_error_return(regs, EPERM); - goto out; - } -#endif - read_lock(&tasklist_lock); - child = find_task_by_pid(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); - if (!child) { - pt_error_return(regs, ESRCH); + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) { + ret = PTR_ERR(child); + pt_error_return(regs, -ret); goto out; } diff --git a/arch/sparc64/kernel/ptrace.c b/arch/sparc64/kernel/ptrace.c index 774ecbb8a0..84d3df2264 100644 --- a/arch/sparc64/kernel/ptrace.c +++ b/arch/sparc64/kernel/ptrace.c @@ -198,39 +198,15 @@ asmlinkage void do_ptrace(struct pt_regs *regs) } #endif if (request == PTRACE_TRACEME) { - int ret; - - /* are we already being traced? */ - if (current->ptrace & PT_PTRACED) { - pt_error_return(regs, EPERM); - goto out; - } - ret = security_ptrace(current->parent, current); - if (ret) { - pt_error_return(regs, -ret); - goto out; - } - - /* set the ptrace bit in the process flags. */ - current->ptrace |= PT_PTRACED; + ret = ptrace_traceme(); pt_succ_return(regs, 0); goto out; } -#ifndef ALLOW_INIT_TRACING - if (pid == 1) { - /* Can't dork with init. 
*/ - pt_error_return(regs, EPERM); - goto out; - } -#endif - read_lock(&tasklist_lock); - child = find_task_by_pid(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); - if (!child) { - pt_error_return(regs, ESRCH); + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) { + ret = PTR_ERR(child); + pt_error_return(regs, -ret); goto out; } diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c index 2a925e2af3..5f4cdfa569 100644 --- a/arch/x86_64/ia32/ptrace32.c +++ b/arch/x86_64/ia32/ptrace32.c @@ -196,36 +196,6 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val) #undef R32 -static struct task_struct *find_target(int request, int pid, int *err) -{ - struct task_struct *child; - - *err = -EPERM; - if (pid == 1) - return NULL; - - *err = -ESRCH; - read_lock(&tasklist_lock); - child = find_task_by_pid(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); - if (child) { - *err = -EPERM; - if (child->pid == 1) - goto out; - *err = ptrace_check_attach(child, request == PTRACE_KILL); - if (*err < 0) - goto out; - return child; - } - out: - if (child) - put_task_struct(child); - return NULL; - -} - asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) { struct task_struct *child; @@ -254,9 +224,16 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) break; } - child = find_target(request, pid, &ret); - if (!child) - return ret; + if (request == PTRACE_TRACEME) + return ptrace_traceme(); + + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) + return PTR_ERR(child); + + ret = ptrace_check_attach(child, request == PTRACE_KILL); + if (ret < 0) + goto out; childregs = (struct pt_regs *)(child->thread.rsp0 - sizeof(struct pt_regs)); @@ -373,6 +350,7 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) break; } + out: put_task_struct(child); return ret; } diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index b2b3dba129..864791996b 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -80,6 +80,8 @@ extern long arch_ptrace(struct task_struct *child, long request, long addr, long data); +extern struct task_struct *ptrace_get_task_struct(pid_t pid); +extern int ptrace_traceme(void); extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len); extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len); extern int ptrace_attach(struct task_struct *tsk); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 656476eedb..cceaf09ac4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -408,54 +408,62 @@ int ptrace_request(struct task_struct *child, long request, return ret; } -#ifndef __ARCH_SYS_PTRACE -static int ptrace_get_task_struct(long request, long pid, - struct task_struct **childp) +/** + * ptrace_traceme -- helper for PTRACE_TRACEME + * + * Performs checks and sets PT_PTRACED. + * Should be used by all ptrace implementations for PTRACE_TRACEME. + */ +int ptrace_traceme(void) { - struct task_struct *child; int ret; /* - * Callers use child == NULL as an indication to exit early even - * when the return value is 0, so make sure it is non-NULL here. + * Are we already being traced? + */ + if (current->ptrace & PT_PTRACED) + return -EPERM; + ret = security_ptrace(current->parent, current); + if (ret) + return -EPERM; + /* + * Set the ptrace bit in the process ptrace flags. 
*/ - *childp = NULL; + current->ptrace |= PT_PTRACED; + return 0; +} - if (request == PTRACE_TRACEME) { - /* - * Are we already being traced? - */ - if (current->ptrace & PT_PTRACED) - return -EPERM; - ret = security_ptrace(current->parent, current); - if (ret) - return -EPERM; - /* - * Set the ptrace bit in the process ptrace flags. - */ - current->ptrace |= PT_PTRACED; - return 0; - } +/** + * ptrace_get_task_struct -- grab a task struct reference for ptrace + * @pid: process id to grab a task_struct reference of + * + * This function is a helper for ptrace implementations. It checks + * permissions and then grabs a task struct for use of the actual + * ptrace implementation. + * + * Returns the task_struct for @pid or an ERR_PTR() on failure. + */ +struct task_struct *ptrace_get_task_struct(pid_t pid) +{ + struct task_struct *child; /* - * You may not mess with init + * Tracing init is not allowed. */ if (pid == 1) - return -EPERM; + return ERR_PTR(-EPERM); - ret = -ESRCH; read_lock(&tasklist_lock); child = find_task_by_pid(pid); if (child) get_task_struct(child); read_unlock(&tasklist_lock); if (!child) - return -ESRCH; - - *childp = child; - return 0; + return ERR_PTR(-ESRCH); + return child; } +#ifndef __ARCH_SYS_PTRACE asmlinkage long sys_ptrace(long request, long pid, long addr, long data) { struct task_struct *child; @@ -465,9 +473,16 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) * This lock_kernel fixes a subtle race with suid exec */ lock_kernel(); - ret = ptrace_get_task_struct(request, pid, &child); - if (!child) + if (request == PTRACE_TRACEME) { + ret = ptrace_traceme(); goto out; + } + + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) { + ret = PTR_ERR(child); + goto out; + } if (request == PTRACE_ATTACH) { ret = ptrace_attach(child); -- cgit v1.2.2 From b8b0af24353eafadf58a0889999700e43f135aad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 8 Jan 2006 01:02:33 -0800 Subject: [PATCH] udf: remove bogus inode == NULL check in inode_bmap inode can never be NULL when calling this function. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/udf/inode.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 4014f17d38..395e582ee5 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1957,11 +1957,6 @@ int8_t inode_bmap(struct inode *inode, int block, kernel_lb_addr *bloc, uint32_t printk(KERN_ERR "udf: inode_bmap: block < 0\n"); return -1; } - if (!inode) - { - printk(KERN_ERR "udf: inode_bmap: NULL inode\n"); - return -1; - } *extoffset = 0; *elen = 0; -- cgit v1.2.2 From 954de9141e75cf2f1ce69ccdbedc83ec827a01ec Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Sun, 8 Jan 2006 01:02:34 -0800 Subject: [PATCH] vgacon: fix doublescan mode When doublescan mode is in use, scanlines must be doubled. Thanks to Jason Dravet for reporting and testing. 
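In other words, the logic added to vgacon_doresize() amounts to the following annotated check, where vga_video_port_reg and vga_video_port_val are vgacon's existing CRTC index/data ports and VGA_CRTC_MAX_SCAN is the Maximum Scan Line register index from <video/vga.h>:

            u8 max_scan;

            /* CRTC Maximum Scan Line register: bit 7 is the scan-doubling flag */
            outb_p(VGA_CRTC_MAX_SCAN, vga_video_port_reg);
            max_scan = inb_p(vga_video_port_val);

            if (max_scan & 0x80)
                    scanlines <<= 1;  /* every text row is drawn twice */

so the vertical timing values programmed afterwards match what the CRT actually scans out.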
Signed-off-by: Samuel Thibault Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/console/vgacon.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c index 167de397e4..f4e1c4b419 100644 --- a/drivers/video/console/vgacon.c +++ b/drivers/video/console/vgacon.c @@ -503,10 +503,16 @@ static int vgacon_doresize(struct vc_data *c, { unsigned long flags; unsigned int scanlines = height * c->vc_font.height; - u8 scanlines_lo, r7, vsync_end, mode; + u8 scanlines_lo, r7, vsync_end, mode, max_scan; spin_lock_irqsave(&vga_lock, flags); + outb_p(VGA_CRTC_MAX_SCAN, vga_video_port_reg); + max_scan = inb_p(vga_video_port_val); + + if (max_scan & 0x80) + scanlines <<= 1; + outb_p(VGA_CRTC_MODE, vga_video_port_reg); mode = inb_p(vga_video_port_val); -- cgit v1.2.2 From 53dbb26dbcf0d844967677633d7587c1d34740d5 Mon Sep 17 00:00:00 2001 From: "Antonino A. Daplas" Date: Sun, 8 Jan 2006 01:02:36 -0800 Subject: [PATCH] vgacon: Workaround for resize bug in some chipsets Reported from Redhat Bugzilla Bug 170450 "I updated to the development kernel and now during boot only the top of the text is visable. For example the monitor screen the is the lines and I can only see text in the asterisk area. --- drivers/video/console/vgacon.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c index f4e1c4b419..12d9329d14 100644 --- a/drivers/video/console/vgacon.c +++ b/drivers/video/console/vgacon.c @@ -56,6 +56,8 @@ static DEFINE_SPINLOCK(vga_lock); static int cursor_size_lastfrom; static int cursor_size_lastto; +static u32 vgacon_xres; +static u32 vgacon_yres; static struct vgastate state; #define BLANK 0x0020 @@ -69,7 +71,7 @@ static struct vgastate state; * appear. */ #undef TRIDENT_GLITCH - +#define VGA_FONTWIDTH 8 /* VGA does not support fontwidths != 8 */ /* * Interface used by the world */ @@ -325,6 +327,10 @@ static const char __init *vgacon_startup(void) vga_scan_lines = vga_video_font_height * vga_video_num_lines; } + + vgacon_xres = ORIG_VIDEO_COLS * VGA_FONTWIDTH; + vgacon_yres = vga_scan_lines; + return display_desc; } @@ -513,6 +519,8 @@ static int vgacon_doresize(struct vc_data *c, if (max_scan & 0x80) scanlines <<= 1; + vgacon_xres = width * VGA_FONTWIDTH; + vgacon_yres = height * c->vc_font.height; outb_p(VGA_CRTC_MODE, vga_video_port_reg); mode = inb_p(vga_video_port_val); @@ -557,6 +565,10 @@ static int vgacon_doresize(struct vc_data *c, static int vgacon_switch(struct vc_data *c) { + int x = c->vc_cols * VGA_FONTWIDTH; + int y = c->vc_rows * c->vc_font.height; + int rows = ORIG_VIDEO_LINES * vga_default_font_height/ + c->vc_font.height; /* * We need to save screen size here as it's the only way * we can spot the screen has been resized and we need to @@ -572,10 +584,11 @@ static int vgacon_switch(struct vc_data *c) scr_memcpyw((u16 *) c->vc_origin, (u16 *) c->vc_screenbuf, c->vc_screenbuf_size > vga_vram_size ? 
vga_vram_size : c->vc_screenbuf_size); - if (!(vga_video_num_columns % 2) && - vga_video_num_columns <= ORIG_VIDEO_COLS && - vga_video_num_lines <= (ORIG_VIDEO_LINES * - vga_default_font_height) / c->vc_font.height) + + if ((vgacon_xres != x || vgacon_yres != y) && + (!(vga_video_num_columns % 2) && + vga_video_num_columns <= ORIG_VIDEO_COLS && + vga_video_num_lines <= rows)) vgacon_doresize(c, c->vc_cols, c->vc_rows); } @@ -999,7 +1012,8 @@ static int vgacon_font_set(struct vc_data *c, struct console_font *font, unsigne if (vga_video_type < VIDEO_TYPE_EGAM) return -EINVAL; - if (font->width != 8 || (charcount != 256 && charcount != 512)) + if (font->width != VGA_FONTWIDTH || + (charcount != 256 && charcount != 512)) return -EINVAL; rc = vgacon_do_font_op(&state, font->data, 1, charcount == 512); @@ -1016,7 +1030,7 @@ static int vgacon_font_get(struct vc_data *c, struct console_font *font) if (vga_video_type < VIDEO_TYPE_EGAM) return -EINVAL; - font->width = 8; + font->width = VGA_FONTWIDTH; font->height = c->vc_font.height; font->charcount = vga_512_chars ? 512 : 256; if (!font->data) -- cgit v1.2.2 From 788540141f4549637e89aadca6e25cf25eb53383 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:02:37 -0800 Subject: [PATCH] Permit multiple inclusion of linux/pagevec.h Make it possible to include linux/pagevec.h multiple times without incurring errors due to duplicate definitions. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index def32c5715..8eb7fa76c1 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -5,6 +5,9 @@ * pages. A pagevec is a multipage container which is used for that. */ +#ifndef _LINUX_PAGEVEC_H +#define _LINUX_PAGEVEC_H + /* 14 pointers + two long's align the pagevec structure to a power of two */ #define PAGEVEC_SIZE 14 @@ -83,3 +86,5 @@ static inline void pagevec_lru_add(struct pagevec *pvec) if (pagevec_count(pvec)) __pagevec_lru_add(pvec); } + +#endif /* _LINUX_PAGEVEC_H */ -- cgit v1.2.2 From 4a30131e7dbb17e5fec6958bfac9da9aff1fa29b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 8 Jan 2006 01:02:39 -0800 Subject: [PATCH] Fix some problems with truncate and mtime semantics. SUS requires that when truncating a file to the size that it currently is: truncate and ftruncate should NOT modify ctime or mtime O_TRUNC SHOULD modify ctime and mtime. Currently mtime and ctime are always modified on most local filesystems (side effect of ->truncate) or never modified (on NFS). With this patch: ATTR_CTIME|ATTR_MTIME are sent with ATTR_SIZE precisely when an update of these times is required whether size changes or not (via a new argument to do_truncate). This allows NFS to do the right thing for O_TRUNC. inode_setattr nolonger forces ATTR_MTIME|ATTR_CTIME when the ATTR_SIZE sets the size to it's current value. This allows local filesystems to do the right thing for f?truncate. Also, the logic in inode_setattr is changed a bit so there are two return points. One returns the error from vmtruncate if it failed, the other returns 0 (there can be no other failure). Finally, if vmtruncate succeeds, and ATTR_SIZE is the only change requested, we now fall-through and mark_inode_dirty. If a filesystem did not have a ->truncate function, then vmtruncate will have changed i_size, without marking the inode as 'dirty', and I think this is wrong. 
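A small userspace check of the required semantics (an illustrative test, not part of the patch; error handling omitted and the file name is arbitrary) could look like:

    #include <stdio.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <sys/stat.h>

    int main(void)
    {
            struct stat before, after;
            int fd = open("testfile", O_RDWR | O_CREAT, 0644);

            write(fd, "data", 4);
            fstat(fd, &before);
            sleep(1);

            /* truncating to the current size must NOT touch mtime/ctime */
            ftruncate(fd, before.st_size);
            fstat(fd, &after);
            printf("same-size ftruncate changed mtime: %s\n",
                   after.st_mtime != before.st_mtime ? "yes (bad)" : "no (ok)");
            close(fd);

            /* O_TRUNC must update mtime/ctime even when the size is unchanged */
            truncate("testfile", 0);
            stat("testfile", &before);
            sleep(1);
            fd = open("testfile", O_WRONLY | O_TRUNC);
            fstat(fd, &after);
            printf("O_TRUNC on empty file changed mtime: %s\n",
                   after.st_mtime != before.st_mtime ? "yes (ok)" : "no (bad)");
            close(fd);
            return 0;
    }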
Signed-off-by: Neil Brown Cc: Christoph Hellwig Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/attr.c | 24 ++++++++---------------- fs/exec.c | 2 +- fs/namei.c | 2 +- fs/open.c | 9 +++++---- include/linux/fs.h | 3 ++- 5 files changed, 17 insertions(+), 23 deletions(-) diff --git a/fs/attr.c b/fs/attr.c index 67bcd9b14e..b34732506f 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -67,20 +67,12 @@ EXPORT_SYMBOL(inode_change_ok); int inode_setattr(struct inode * inode, struct iattr * attr) { unsigned int ia_valid = attr->ia_valid; - int error = 0; - - if (ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); - if (error || (ia_valid == ATTR_SIZE)) - goto out; - } else { - /* - * We skipped the truncate but must still update - * timestamps - */ - ia_valid |= ATTR_MTIME|ATTR_CTIME; - } + + if (ia_valid & ATTR_SIZE && + attr->ia_size != i_size_read(inode)) { + int error = vmtruncate(inode, attr->ia_size); + if (error) + return error; } if (ia_valid & ATTR_UID) @@ -104,8 +96,8 @@ int inode_setattr(struct inode * inode, struct iattr * attr) inode->i_mode = mode; } mark_inode_dirty(inode); -out: - return error; + + return 0; } EXPORT_SYMBOL(inode_setattr); diff --git a/fs/exec.c b/fs/exec.c index e9650cd22a..2075b674d8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1505,7 +1505,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) goto close_fail; if (!file->f_op->write) goto close_fail; - if (do_truncate(file->f_dentry, 0, file) != 0) + if (do_truncate(file->f_dentry, 0, 0, file) != 0) goto close_fail; retval = binfmt->core_dump(signr, regs, file); diff --git a/fs/namei.c b/fs/namei.c index 6dbbd42d8b..300eae088d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1491,7 +1491,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) if (!error) { DQUOT_INIT(inode); - error = do_truncate(dentry, 0, NULL); + error = do_truncate(dentry, 0, ATTR_MTIME|ATTR_CTIME, NULL); } put_write_access(inode); if (error) diff --git a/fs/open.c b/fs/open.c index f53a5b9ffb..94968cb3af 100644 --- a/fs/open.c +++ b/fs/open.c @@ -194,7 +194,8 @@ out: return error; } -int do_truncate(struct dentry *dentry, loff_t length, struct file *filp) +int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, + struct file *filp) { int err; struct iattr newattrs; @@ -204,7 +205,7 @@ int do_truncate(struct dentry *dentry, loff_t length, struct file *filp) return -EINVAL; newattrs.ia_size = length; - newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + newattrs.ia_valid = ATTR_SIZE | time_attrs; if (filp) { newattrs.ia_file = filp; newattrs.ia_valid |= ATTR_FILE; @@ -266,7 +267,7 @@ static inline long do_sys_truncate(const char __user * path, loff_t length) error = locks_verify_truncate(inode, NULL, length); if (!error) { DQUOT_INIT(inode); - error = do_truncate(nd.dentry, length, NULL); + error = do_truncate(nd.dentry, length, 0, NULL); } put_write_access(inode); @@ -318,7 +319,7 @@ static inline long do_sys_ftruncate(unsigned int fd, loff_t length, int small) error = locks_verify_truncate(inode, file, length); if (!error) - error = do_truncate(dentry, length, file); + error = do_truncate(dentry, length, 0, file); out_putf: fput(file); out: diff --git a/include/linux/fs.h b/include/linux/fs.h index ef29500b5d..74c01aabd4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1344,7 +1344,8 @@ static inline int break_lease(struct inode *inode, unsigned int mode) /* fs/open.c */ -extern int do_truncate(struct 
dentry *, loff_t start, struct file *filp); +extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, + struct file *filp); extern long do_sys_open(const char __user *filename, int flags, int mode); extern struct file *filp_open(const char *, int, int); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); -- cgit v1.2.2 From 2520f14ca85e38f575eed6acc6e586df246abea6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 8 Jan 2006 01:02:40 -0800 Subject: [PATCH] Fix overflow tests for compat_sys_fcntl64 locking When making an fctl locking call through compat_sys_fcntl64 (i.e. a 32bit app on a 64bit kernel), the syscall can return a locking range that is in conflict with the queried lock. If some aspect of this range does not fit in the 32bit structure, something needs to be done. The current code is wrong in several respects: - It returns data to userspace even if no conflict was found i.e. it should check l_type for F_UNLCK - It returns -EOVERFLOW too agressively. A lock range covering the last possible byte of the file (start = COMPAT_OFF_T_MAX, len = 1) should be possible, but is rejected with the current test. - A extra-long 'len' should not be a problem. If only that part of the conflicting lock that would be visible to the 32bit app needs to be reported to the 32bit app anyway. This patch addresses those three issues and adds a comment to (hopefully) record it for posterity. Note: this patch mainly affects test-cases. Real applications rarely is ever see the problems. This patch has been tested (LSB test suite), and works. Signed-off-by: Neil Brown Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/compat.c b/fs/compat.c index 55ac0324aa..271b75d159 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -494,9 +494,21 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, ret = sys_fcntl(fd, cmd, (unsigned long)&f); set_fs(old_fs); if (cmd == F_GETLK && ret == 0) { - if ((f.l_start >= COMPAT_OFF_T_MAX) || - ((f.l_start + f.l_len) > COMPAT_OFF_T_MAX)) + /* GETLK was successfule and we need to return the data... + * but it needs to fit in the compat structure. + * l_start shouldn't be too big, unless the original + * start + end is greater than COMPAT_OFF_T_MAX, in which + * case the app was asking for trouble, so we return + * -EOVERFLOW in that case. 
+ * l_len could be too big, in which case we just truncate it, + * and only allow the app to see that part of the conflicting + * lock that might make sense to it anyway + */ + + if (f.l_start > COMPAT_OFF_T_MAX) ret = -EOVERFLOW; + if (f.l_len > COMPAT_OFF_T_MAX) + f.l_len = COMPAT_OFF_T_MAX; if (ret == 0) ret = put_compat_flock(&f, compat_ptr(arg)); } @@ -515,9 +527,11 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, (unsigned long)&f); set_fs(old_fs); if (cmd == F_GETLK64 && ret == 0) { - if ((f.l_start >= COMPAT_LOFF_T_MAX) || - ((f.l_start + f.l_len) > COMPAT_LOFF_T_MAX)) + /* need to return lock information - see above for commentary */ + if (f.l_start > COMPAT_LOFF_T_MAX) ret = -EOVERFLOW; + if (f.l_len > COMPAT_LOFF_T_MAX) + f.l_len = COMPAT_LOFF_T_MAX; if (ret == 0) ret = put_compat_flock64(&f, compat_ptr(arg)); } -- cgit v1.2.2 From 025510cd20f4c35c3958bea133d96c9bd7c6ef9e Mon Sep 17 00:00:00 2001 From: Guillaume Chazarain Date: Sun, 8 Jan 2006 01:02:41 -0800 Subject: [PATCH] printk return value: fix it What's the true meaning of the printk return value? Should it include the priority prefix length of 3? and what about the timing information? In both cases it was broken: strace -e write echo 1 > /dev/kmsg => write(1, "1\n", 2) = 5 strace -e write echo "<1>1" > /dev/kmsg => write(1, "<1>1\n", 5) = 8 The returned length was "length of input string + 3", I made it "length of string output to the log buffer". Note that I couldn't find any printk caller in the kernel interested by its return value besides kmsg_write. Signed-off-by: Guillaume Chazarain Acked-By: Tim Bird Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/printk.c b/kernel/printk.c index 5287be83e3..2251be80cd 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -569,7 +569,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) p[1] <= '7' && p[2] == '>') { loglev_char = p[1]; p += 3; - printed_len += 3; + printed_len -= 3; } else { loglev_char = default_message_loglevel + '0'; @@ -584,7 +584,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) for (tp = tbuf; tp < tbuf + tlen; tp++) emit_log_char(*tp); - printed_len += tlen - 3; + printed_len += tlen; } else { if (p[0] != '<' || p[1] < '0' || p[1] > '7' || p[2] != '>') { @@ -592,8 +592,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) emit_log_char(default_message_loglevel + '0'); emit_log_char('>'); + printed_len += 3; } - printed_len += 3; } log_level_unknown = 0; if (!*p) -- cgit v1.2.2 From cd140a5c1f456f50897af4a2e9a23d228a5fe719 Mon Sep 17 00:00:00 2001 From: Guillaume Chazarain Date: Sun, 8 Jan 2006 01:02:43 -0800 Subject: [PATCH] kmsg_write: don't return printk return value kmsg_write returns with printk, so some programs may be confused by a successful write() with a return value different than the buffer length. # /bin/echo something > /dev/kmsg /bin/echo: write error: Inappropriate ioctl for device The drawbacks is that the printk return value can no more be quickly checked from userspace. 
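From userspace, the combined effect of this and the previous patch is easiest to see with a plain write() to /dev/kmsg; a sketch of such a check (it assumes root privileges and that a "<1>" prefix selects the log level as usual):

    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            const char *msg = "<1>hello from userspace\n";
            int fd = open("/dev/kmsg", O_WRONLY);
            ssize_t n;

            if (fd < 0) {
                    perror("open /dev/kmsg");
                    return 1;
            }

            /* with the fix a successful write reports at most the number of
             * bytes passed in, which keeps echo-style callers happy */
            n = write(fd, msg, strlen(msg));
            printf("wrote %zd of %zu bytes\n", n, strlen(msg));

            close(fd);
            return 0;
    }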
Signed-off-by: Guillaume Chazarain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/mem.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 91dd669273..a85f3a3614 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -817,7 +817,7 @@ static ssize_t kmsg_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { char *tmp; - int ret; + ssize_t ret; tmp = kmalloc(count + 1, GFP_KERNEL); if (tmp == NULL) @@ -826,6 +826,9 @@ static ssize_t kmsg_write(struct file * file, const char __user * buf, if (!copy_from_user(tmp, buf, count)) { tmp[count] = 0; ret = printk("%s", tmp); + if (ret > count) + /* printk can add a prefix */ + ret = count; } kfree(tmp); return ret; -- cgit v1.2.2 From 017679c4d45783158dba1dd6f79e712c22bb3d9a Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:02:43 -0800 Subject: [PATCH] keys: Permit key expiry time to be set Add a new keyctl function that allows the expiry time to be set on a key or removed from a key, provided the caller has attribute modification access. Signed-off-by: David Howells Cc: Trond Myklebust Cc: Alexander Zangerl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/keys.txt | 15 ++++++++++++++- include/linux/keyctl.h | 1 + security/keys/compat.c | 3 +++ security/keys/internal.h | 1 + security/keys/keyctl.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 63 insertions(+), 1 deletion(-) diff --git a/Documentation/keys.txt b/Documentation/keys.txt index 6304db59bf..c17c4ca743 100644 --- a/Documentation/keys.txt +++ b/Documentation/keys.txt @@ -498,7 +498,7 @@ The keyctl syscall functions are: keyring is full, error ENFILE will result. The link procedure checks the nesting of the keyrings, returning ELOOP if - it appears to deep or EDEADLK if the link would introduce a cycle. + it appears too deep or EDEADLK if the link would introduce a cycle. (*) Unlink a key or keyring from another keyring: @@ -628,6 +628,19 @@ The keyctl syscall functions are: there is one, otherwise the user default session keyring. + (*) Set the timeout on a key. + + long keyctl(KEYCTL_SET_TIMEOUT, key_serial_t key, unsigned timeout); + + This sets or clears the timeout on a key. The timeout can be 0 to clear + the timeout or a number of seconds to set the expiry time that far into + the future. + + The process must have attribute modification access on a key to set its + timeout. Timeouts may not be set with this function on negative, revoked + or expired keys. 
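As a userspace sketch of the new command (illustrative only: the raw syscall(2) calls, the "user" key type, and headers that already carry the KEYCTL_SET_TIMEOUT definition are all assumptions):

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/keyctl.h>

    int main(void)
    {
            long key, ret;

            /* add a "user" key to the session keyring */
            key = syscall(__NR_add_key, "user", "example:token",
                          "secret", 6, KEY_SPEC_SESSION_KEYRING);
            if (key < 0) {
                    perror("add_key");
                    return 1;
            }

            /* give the key a 60 second lifetime; a timeout of 0 clears it */
            ret = syscall(__NR_keyctl, KEYCTL_SET_TIMEOUT, key, 60);
            if (ret < 0)
                    perror("keyctl(KEYCTL_SET_TIMEOUT)");

            return ret < 0;
    }

Afterwards /proc/keys should show the key counting down towards expiry.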
+ + =============== KERNEL SERVICES =============== diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h index 8d7c59a29e..ec8f3d622a 100644 --- a/include/linux/keyctl.h +++ b/include/linux/keyctl.h @@ -46,5 +46,6 @@ #define KEYCTL_INSTANTIATE 12 /* instantiate a partially constructed key */ #define KEYCTL_NEGATE 13 /* negate a partially constructed key */ #define KEYCTL_SET_REQKEY_KEYRING 14 /* set default request-key keyring */ +#define KEYCTL_SET_TIMEOUT 15 /* set key timeout */ #endif /* _LINUX_KEYCTL_H */ diff --git a/security/keys/compat.c b/security/keys/compat.c index 3303673c63..e8e7ef4a29 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -74,6 +74,9 @@ asmlinkage long compat_sys_keyctl(u32 option, case KEYCTL_SET_REQKEY_KEYRING: return keyctl_set_reqkey_keyring(arg2); + case KEYCTL_SET_TIMEOUT: + return keyctl_set_timeout(arg2, arg3); + default: return -EOPNOTSUPP; } diff --git a/security/keys/internal.h b/security/keys/internal.h index 39cba97c5e..51f37c0bdb 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -136,6 +136,7 @@ extern long keyctl_instantiate_key(key_serial_t, const void __user *, size_t, key_serial_t); extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t); extern long keyctl_set_reqkey_keyring(int); +extern long keyctl_set_timeout(key_serial_t, unsigned); /* diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index b7a468fabd..299f0ae11c 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -965,6 +965,46 @@ long keyctl_set_reqkey_keyring(int reqkey_defl) } /* end keyctl_set_reqkey_keyring() */ +/*****************************************************************************/ +/* + * set or clear the timeout for a key + */ +long keyctl_set_timeout(key_serial_t id, unsigned timeout) +{ + struct timespec now; + struct key *key; + key_ref_t key_ref; + time_t expiry; + long ret; + + key_ref = lookup_user_key(NULL, id, 1, 1, KEY_SETATTR); + if (IS_ERR(key_ref)) { + ret = PTR_ERR(key_ref); + goto error; + } + + key = key_ref_to_ptr(key_ref); + + /* make the changes with the locks held to prevent races */ + down_write(&key->sem); + + expiry = 0; + if (timeout > 0) { + now = current_kernel_time(); + expiry = now.tv_sec + timeout; + } + + key->expiry = expiry; + + up_write(&key->sem); + key_put(key); + + ret = 0; +error: + return ret; + +} /* end keyctl_set_timeout() */ + /*****************************************************************************/ /* * the key control system call @@ -1038,6 +1078,10 @@ asmlinkage long sys_keyctl(int option, unsigned long arg2, unsigned long arg3, case KEYCTL_SET_REQKEY_KEYRING: return keyctl_set_reqkey_keyring(arg2); + case KEYCTL_SET_TIMEOUT: + return keyctl_set_timeout((key_serial_t) arg2, + (unsigned) arg3); + default: return -EOPNOTSUPP; } -- cgit v1.2.2 From cab8eb594e84b434d20412fc5a3985b0bee3ab9f Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:02:45 -0800 Subject: [PATCH] keys: Discard duplicate keys from a keyring on link Cause any links within a keyring to keys that match a key to be linked into that keyring to be discarded as a link to the new key is added. The match is contingent on the type and description strings being the same. This permits requests, adds and searches to displace negative, expired, revoked and dead keys easily. After some discussion it was concluded that duplicate valid keys should probably be discarded also as they would otherwise hide the new key. 
Since request_key() is intended to be the primary method by which keys are added to a keyring, duplicate valid keys wouldn't be an issue there as that function would return an existing match in preference to creating a new key. Signed-off-by: David Howells Cc: Trond Myklebust Cc: Alexander Zangerl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/keys.txt | 4 +++ security/keys/keyring.c | 87 ++++++++++++++++++++++++++++++++++++------------- 2 files changed, 68 insertions(+), 23 deletions(-) diff --git a/Documentation/keys.txt b/Documentation/keys.txt index c17c4ca743..eeda00f82d 100644 --- a/Documentation/keys.txt +++ b/Documentation/keys.txt @@ -500,6 +500,10 @@ The keyctl syscall functions are: The link procedure checks the nesting of the keyrings, returning ELOOP if it appears too deep or EDEADLK if the link would introduce a cycle. + Any links within the keyring to keys that match the new key in terms of + type and description will be discarded from the keyring as the new one is + added. + (*) Unlink a key or keyring from another keyring: diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 5d22c0388b..09d92d52ef 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -682,17 +682,33 @@ static void keyring_link_rcu_disposal(struct rcu_head *rcu) } /* end keyring_link_rcu_disposal() */ +/*****************************************************************************/ +/* + * dispose of a keyring list after the RCU grace period, freeing the unlinked + * key + */ +static void keyring_unlink_rcu_disposal(struct rcu_head *rcu) +{ + struct keyring_list *klist = + container_of(rcu, struct keyring_list, rcu); + + key_put(klist->keys[klist->delkey]); + kfree(klist); + +} /* end keyring_unlink_rcu_disposal() */ + /*****************************************************************************/ /* * link a key into to a keyring * - must be called with the keyring's semaphore write-locked + * - discard already extant link to matching key if there is one */ int __key_link(struct key *keyring, struct key *key) { struct keyring_list *klist, *nklist; unsigned max; size_t size; - int ret; + int loop, ret; ret = -EKEYREVOKED; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) @@ -714,6 +730,48 @@ int __key_link(struct key *keyring, struct key *key) goto error2; } + /* see if there's a matching key we can displace */ + klist = keyring->payload.subscriptions; + + if (klist && klist->nkeys > 0) { + struct key_type *type = key->type; + + for (loop = klist->nkeys - 1; loop >= 0; loop--) { + if (klist->keys[loop]->type == type && + strcmp(klist->keys[loop]->description, + key->description) == 0 + ) { + /* found a match - replace with new key */ + size = sizeof(struct key *) * klist->maxkeys; + size += sizeof(*klist); + BUG_ON(size > PAGE_SIZE); + + ret = -ENOMEM; + nklist = kmalloc(size, GFP_KERNEL); + if (!nklist) + goto error2; + + memcpy(nklist, klist, size); + + /* replace matched key */ + atomic_inc(&key->usage); + nklist->keys[loop] = key; + + rcu_assign_pointer( + keyring->payload.subscriptions, + nklist); + + /* dispose of the old keyring list and the + * displaced key */ + klist->delkey = loop; + call_rcu(&klist->rcu, + keyring_unlink_rcu_disposal); + + goto done; + } + } + } + /* check that we aren't going to overrun the user's quota */ ret = key_payload_reserve(keyring, keyring->datalen + KEYQUOTA_LINK_BYTES); @@ -730,8 +788,6 @@ int __key_link(struct key *keyring, struct key *key) smp_wmb(); klist->nkeys++; smp_wmb(); - - ret = 0; } else { /* grow 
the key list */ @@ -769,16 +825,16 @@ int __key_link(struct key *keyring, struct key *key) /* dispose of the old keyring list */ if (klist) call_rcu(&klist->rcu, keyring_link_rcu_disposal); - - ret = 0; } - error2: +done: + ret = 0; +error2: up_write(&keyring_serialise_link_sem); - error: +error: return ret; - error3: +error3: /* undo the quota changes */ key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); @@ -807,21 +863,6 @@ int key_link(struct key *keyring, struct key *key) EXPORT_SYMBOL(key_link); -/*****************************************************************************/ -/* - * dispose of a keyring list after the RCU grace period, freeing the unlinked - * key - */ -static void keyring_unlink_rcu_disposal(struct rcu_head *rcu) -{ - struct keyring_list *klist = - container_of(rcu, struct keyring_list, rcu); - - key_put(klist->keys[klist->delkey]); - kfree(klist); - -} /* end keyring_unlink_rcu_disposal() */ - /*****************************************************************************/ /* * unlink the first link to a key from a keyring -- cgit v1.2.2 From b5f545c880a2a47947ba2118b2509644ab7a2969 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:02:47 -0800 Subject: [PATCH] keys: Permit running process to instantiate keys Make it possible for a running process (such as gssapid) to be able to instantiate a key, as was requested by Trond Myklebust for NFS4. The patch makes the following changes: (1) A new, optional key type method has been added. This permits a key type to intercept requests at the point /sbin/request-key is about to be spawned and do something else with them - passing them over the rpc_pipefs files or netlink sockets for instance. The uninstantiated key, the authorisation key and the intended operation name are passed to the method. (2) The callout_info is no longer passed as an argument to /sbin/request-key to prevent unauthorised viewing of this data using ps or by looking in /proc/pid/cmdline. This means that the old /sbin/request-key program will not work with the patched kernel as it will expect to see an extra argument that is no longer there. A revised keyutils package will be made available tomorrow. (3) The callout_info is now attached to the authorisation key. Reading this key will retrieve the information. (4) A new field has been added to the task_struct. This holds the authorisation key currently active for a thread. Searches now look here for the caller's set of keys rather than looking for an auth key in the lowest level of the session keyring. This permits a thread to be servicing multiple requests at once and to switch between them. Note that this is per-thread, not per-process, and so is usable in multithreaded programs. The setting of this field is inherited across fork and exec. (5) A new keyctl function (KEYCTL_ASSUME_AUTHORITY) has been added that permits a thread to assume the authority to deal with an uninstantiated key. Assumption is only permitted if the authorisation key associated with the uninstantiated key is somewhere in the thread's keyrings. This function can also clear the assumption. (6) A new magic key specifier has been added to refer to the currently assumed authorisation key (KEY_SPEC_REQKEY_AUTH_KEY). (7) Instantiation will only proceed if the appropriate authorisation key is assumed first. The assumed authorisation key is discarded if instantiation is successful. (8) key_validate() is moved from the file of request_key functions to the file of permissions functions. 
(9) The documentation is updated. From: Build fix. Signed-off-by: David Howells Cc: Trond Myklebust Cc: Alexander Zangerl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/keys-request-key.txt | 22 +++-- Documentation/keys.txt | 24 +++++ include/linux/key.h | 12 +++ include/linux/keyctl.h | 2 + include/linux/sched.h | 1 + security/keys/compat.c | 3 + security/keys/internal.h | 4 +- security/keys/keyctl.c | 107 ++++++++++++++++----- security/keys/keyring.c | 45 --------- security/keys/permission.c | 32 +++++++ security/keys/process_keys.c | 71 +++++++------- security/keys/request_key.c | 108 ++++++++++----------- security/keys/request_key_auth.c | 192 ++++++++++++++++++++++--------------- 13 files changed, 378 insertions(+), 245 deletions(-) diff --git a/Documentation/keys-request-key.txt b/Documentation/keys-request-key.txt index 5f2b9c5edb..22488d7911 100644 --- a/Documentation/keys-request-key.txt +++ b/Documentation/keys-request-key.txt @@ -56,10 +56,12 @@ A request proceeds in the following manner: (4) request_key() then forks and executes /sbin/request-key with a new session keyring that contains a link to auth key V. - (5) /sbin/request-key execs an appropriate program to perform the actual + (5) /sbin/request-key assumes the authority associated with key U. + + (6) /sbin/request-key execs an appropriate program to perform the actual instantiation. - (6) The program may want to access another key from A's context (say a + (7) The program may want to access another key from A's context (say a Kerberos TGT key). It just requests the appropriate key, and the keyring search notes that the session keyring has auth key V in its bottom level. @@ -67,19 +69,19 @@ A request proceeds in the following manner: UID, GID, groups and security info of process A as if it was process A, and come up with key W. - (7) The program then does what it must to get the data with which to + (8) The program then does what it must to get the data with which to instantiate key U, using key W as a reference (perhaps it contacts a Kerberos server using the TGT) and then instantiates key U. - (8) Upon instantiating key U, auth key V is automatically revoked so that it + (9) Upon instantiating key U, auth key V is automatically revoked so that it may not be used again. - (9) The program then exits 0 and request_key() deletes key V and returns key +(10) The program then exits 0 and request_key() deletes key V and returns key U to the caller. -This also extends further. If key W (step 5 above) didn't exist, key W would be -created uninstantiated, another auth key (X) would be created [as per step 3] -and another copy of /sbin/request-key spawned [as per step 4]; but the context +This also extends further. If key W (step 7 above) didn't exist, key W would be +created uninstantiated, another auth key (X) would be created (as per step 3) +and another copy of /sbin/request-key spawned (as per step 4); but the context specified by auth key X will still be process A, as it was in auth key V. This is because process A's keyrings can't simply be attached to @@ -138,8 +140,8 @@ until one succeeds: (3) The process's session keyring is searched. - (4) If the process has a request_key() authorisation key in its session - keyring then: + (4) If the process has assumed the authority associated with a request_key() + authorisation key then: (a) If extant, the calling process's thread keyring is searched. 
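To make the new flow concrete, here is only a rough sketch (not part of the patch) of what an instantiation helper in the style of /sbin/request-key could do under this scheme; the constants mirror the include/linux/keyctl.h additions below, argv[2] is the target key as in the convention above, and most error handling is omitted:

/* illustrative only: assume authority, fetch the callout data, instantiate */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>

typedef int32_t key_serial_t;
#define KEYCTL_READ			11
#define KEYCTL_INSTANTIATE		12
#define KEYCTL_ASSUME_AUTHORITY		16
#define KEY_SPEC_REQKEY_AUTH_KEY	-7

int main(int argc, char *argv[])
{
	key_serial_t key;
	char callout[256];
	long n;

	if (argc < 3)
		return 1;
	key = (key_serial_t) atoi(argv[2]);	/* the key to instantiate */

	/* take on the authority granted for this particular key */
	if (syscall(__NR_keyctl, KEYCTL_ASSUME_AUTHORITY, key) < 0)
		return 1;

	/* the callout data now lives in the auth key rather than in argv */
	n = syscall(__NR_keyctl, KEYCTL_READ, KEY_SPEC_REQKEY_AUTH_KEY,
		    callout, sizeof(callout) - 1);
	if (n < 0)
		return 1;
	if (n > (long) (sizeof(callout) - 1))
		n = sizeof(callout) - 1;
	callout[n] = '\0';

	/* ... use the callout data to obtain the real payload ... */

	/* instantiating the key also drops the assumed authority */
	if (syscall(__NR_keyctl, KEYCTL_INSTANTIATE, key, "payload",
		    (size_t) 7, 0) < 0)
		return 1;
	return 0;
}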
diff --git a/Documentation/keys.txt b/Documentation/keys.txt index eeda00f82d..aaa01b0e3e 100644 --- a/Documentation/keys.txt +++ b/Documentation/keys.txt @@ -308,6 +308,8 @@ process making the call: KEY_SPEC_USER_KEYRING -4 UID-specific keyring KEY_SPEC_USER_SESSION_KEYRING -5 UID-session keyring KEY_SPEC_GROUP_KEYRING -6 GID-specific keyring + KEY_SPEC_REQKEY_AUTH_KEY -7 assumed request_key() + authorisation key The main syscalls are: @@ -645,6 +647,28 @@ The keyctl syscall functions are: or expired keys. + (*) Assume the authority granted to instantiate a key + + long keyctl(KEYCTL_ASSUME_AUTHORITY, key_serial_t key); + + This assumes or divests the authority required to instantiate the + specified key. Authority can only be assumed if the thread has the + authorisation key associated with the specified key in its keyrings + somewhere. + + Once authority is assumed, searches for keys will also search the + requester's keyrings using the requester's security label, UID, GID and + groups. + + If the requested authority is unavailable, error EPERM will be returned, + likewise if the authority has been revoked because the target key is + already instantiated. + + If the specified key is 0, then any assumed authority will be divested. + + The assumed authorititive key is inherited across fork and exec. + + =============== KERNEL SERVICES =============== diff --git a/include/linux/key.h b/include/linux/key.h index 4d189e51bc..cbf464ad95 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -177,6 +177,8 @@ struct key { /* * kernel managed key type definition */ +typedef int (*request_key_actor_t)(struct key *key, struct key *authkey, const char *op); + struct key_type { /* name of the type */ const char *name; @@ -218,6 +220,16 @@ struct key_type { */ long (*read)(const struct key *key, char __user *buffer, size_t buflen); + /* handle request_key() for this type instead of invoking + * /sbin/request-key (optional) + * - key is the key to instantiate + * - authkey is the authority to assume when instantiating this key + * - op is the operation to be done, usually "create" + * - the call must not return until the instantiation process has run + * its course + */ + request_key_actor_t request_key; + /* internal fields */ struct list_head link; /* link in types list */ }; diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h index ec8f3d622a..3365945640 100644 --- a/include/linux/keyctl.h +++ b/include/linux/keyctl.h @@ -19,6 +19,7 @@ #define KEY_SPEC_USER_KEYRING -4 /* - key ID for UID-specific keyring */ #define KEY_SPEC_USER_SESSION_KEYRING -5 /* - key ID for UID-session keyring */ #define KEY_SPEC_GROUP_KEYRING -6 /* - key ID for GID-specific keyring */ +#define KEY_SPEC_REQKEY_AUTH_KEY -7 /* - key ID for assumed request_key auth key */ /* request-key default keyrings */ #define KEY_REQKEY_DEFL_NO_CHANGE -1 @@ -47,5 +48,6 @@ #define KEYCTL_NEGATE 13 /* negate a partially constructed key */ #define KEYCTL_SET_REQKEY_KEYRING 14 /* set default request-key keyring */ #define KEYCTL_SET_TIMEOUT 15 /* set key timeout */ +#define KEYCTL_ASSUME_AUTHORITY 16 /* assume request_key() authorisation */ #endif /* _LINUX_KEYCTL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 20bd707491..78eb92ae4d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -771,6 +771,7 @@ struct task_struct { unsigned keep_capabilities:1; struct user_struct *user; #ifdef CONFIG_KEYS + struct key *request_key_auth; /* assumed request_key authority */ struct key *thread_keyring; /* 
keyring private to this thread */ unsigned char jit_keyring; /* default keyring to attach requested keys to */ #endif diff --git a/security/keys/compat.c b/security/keys/compat.c index e8e7ef4a29..bcdb285337 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -77,6 +77,9 @@ asmlinkage long compat_sys_keyctl(u32 option, case KEYCTL_SET_TIMEOUT: return keyctl_set_timeout(arg2, arg3); + case KEYCTL_ASSUME_AUTHORITY: + return keyctl_assume_authority(arg2); + default: return -EOPNOTSUPP; } diff --git a/security/keys/internal.h b/security/keys/internal.h index 51f37c0bdb..e066e60579 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -107,12 +107,13 @@ extern struct key *request_key_and_link(struct key_type *type, struct request_key_auth { struct key *target_key; struct task_struct *context; + const char *callout_info; pid_t pid; }; extern struct key_type key_type_request_key_auth; extern struct key *request_key_auth_new(struct key *target, - struct key **_rkakey); + const char *callout_info); extern struct key *key_get_instantiation_authkey(key_serial_t target_id); @@ -137,6 +138,7 @@ extern long keyctl_instantiate_key(key_serial_t, const void __user *, extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t); extern long keyctl_set_reqkey_keyring(int); extern long keyctl_set_timeout(key_serial_t, unsigned); +extern long keyctl_assume_authority(key_serial_t); /* diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 299f0ae11c..3d2ebae029 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -834,6 +834,17 @@ long keyctl_instantiate_key(key_serial_t id, if (plen > 32767) goto error; + /* the appropriate instantiation authorisation key must have been + * assumed before calling this */ + ret = -EPERM; + instkey = current->request_key_auth; + if (!instkey) + goto error; + + rka = instkey->payload.data; + if (rka->target_key->serial != id) + goto error; + /* pull the payload in if one was supplied */ payload = NULL; @@ -848,15 +859,6 @@ long keyctl_instantiate_key(key_serial_t id, goto error2; } - /* find the instantiation authorisation key */ - instkey = key_get_instantiation_authkey(id); - if (IS_ERR(instkey)) { - ret = PTR_ERR(instkey); - goto error2; - } - - rka = instkey->payload.data; - /* find the destination keyring amongst those belonging to the * requesting task */ keyring_ref = NULL; @@ -865,7 +867,7 @@ long keyctl_instantiate_key(key_serial_t id, KEY_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); - goto error3; + goto error2; } } @@ -874,11 +876,17 @@ long keyctl_instantiate_key(key_serial_t id, key_ref_to_ptr(keyring_ref), instkey); key_ref_put(keyring_ref); - error3: - key_put(instkey); - error2: + + /* discard the assumed authority if it's just been disabled by + * instantiation of the key */ + if (ret == 0) { + key_put(current->request_key_auth); + current->request_key_auth = NULL; + } + +error2: kfree(payload); - error: +error: return ret; } /* end keyctl_instantiate_key() */ @@ -895,14 +903,16 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) key_ref_t keyring_ref; long ret; - /* find the instantiation authorisation key */ - instkey = key_get_instantiation_authkey(id); - if (IS_ERR(instkey)) { - ret = PTR_ERR(instkey); + /* the appropriate instantiation authorisation key must have been + * assumed before calling this */ + ret = -EPERM; + instkey = current->request_key_auth; + if (!instkey) goto error; - } rka = instkey->payload.data; + if 
(rka->target_key->serial != id) + goto error; /* find the destination keyring if present (which must also be * writable) */ @@ -911,7 +921,7 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); - goto error2; + goto error; } } @@ -920,9 +930,15 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) key_ref_to_ptr(keyring_ref), instkey); key_ref_put(keyring_ref); - error2: - key_put(instkey); - error: + + /* discard the assumed authority if it's just been disabled by + * instantiation of the key */ + if (ret == 0) { + key_put(current->request_key_auth); + current->request_key_auth = NULL; + } + +error: return ret; } /* end keyctl_negate_key() */ @@ -1005,6 +1021,48 @@ error: } /* end keyctl_set_timeout() */ +/*****************************************************************************/ +/* + * assume the authority to instantiate the specified key + */ +long keyctl_assume_authority(key_serial_t id) +{ + struct key *authkey; + long ret; + + /* special key IDs aren't permitted */ + ret = -EINVAL; + if (id < 0) + goto error; + + /* we divest ourselves of authority if given an ID of 0 */ + if (id == 0) { + key_put(current->request_key_auth); + current->request_key_auth = NULL; + ret = 0; + goto error; + } + + /* attempt to assume the authority temporarily granted to us whilst we + * instantiate the specified key + * - the authorisation key must be in the current task's keyrings + * somewhere + */ + authkey = key_get_instantiation_authkey(id); + if (IS_ERR(authkey)) { + ret = PTR_ERR(authkey); + goto error; + } + + key_put(current->request_key_auth); + current->request_key_auth = authkey; + ret = authkey->serial; + +error: + return ret; + +} /* end keyctl_assume_authority() */ + /*****************************************************************************/ /* * the key control system call @@ -1082,6 +1140,9 @@ asmlinkage long sys_keyctl(int option, unsigned long arg2, unsigned long arg3, return keyctl_set_timeout((key_serial_t) arg2, (unsigned) arg3); + case KEYCTL_ASSUME_AUTHORITY: + return keyctl_assume_authority((key_serial_t) arg2); + default: return -EOPNOTSUPP; } diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 09d92d52ef..d65a180f88 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -479,51 +479,6 @@ key_ref_t __keyring_search_one(key_ref_t keyring_ref, } /* end __keyring_search_one() */ -/*****************************************************************************/ -/* - * search for an instantiation authorisation key matching a target key - * - the RCU read lock must be held by the caller - * - a target_id of zero specifies any valid token - */ -struct key *keyring_search_instkey(struct key *keyring, - key_serial_t target_id) -{ - struct request_key_auth *rka; - struct keyring_list *klist; - struct key *instkey; - int loop; - - klist = rcu_dereference(keyring->payload.subscriptions); - if (klist) { - for (loop = 0; loop < klist->nkeys; loop++) { - instkey = klist->keys[loop]; - - if (instkey->type != &key_type_request_key_auth) - continue; - - rka = instkey->payload.data; - if (target_id && rka->target_key->serial != target_id) - continue; - - /* the auth key is revoked during instantiation */ - if (!test_bit(KEY_FLAG_REVOKED, &instkey->flags)) - goto found; - - instkey = ERR_PTR(-EKEYREVOKED); - goto error; - } - } - - instkey = ERR_PTR(-EACCES); - goto error; - -found: - 
atomic_inc(&instkey->usage); -error: - return instkey; - -} /* end keyring_search_instkey() */ - /*****************************************************************************/ /* * find a keyring with the specified name diff --git a/security/keys/permission.c b/security/keys/permission.c index e7f579c0ea..3b41f9b525 100644 --- a/security/keys/permission.c +++ b/security/keys/permission.c @@ -73,3 +73,35 @@ use_these_perms: } /* end key_task_permission() */ EXPORT_SYMBOL(key_task_permission); + +/*****************************************************************************/ +/* + * validate a key + */ +int key_validate(struct key *key) +{ + struct timespec now; + int ret = 0; + + if (key) { + /* check it's still accessible */ + ret = -EKEYREVOKED; + if (test_bit(KEY_FLAG_REVOKED, &key->flags) || + test_bit(KEY_FLAG_DEAD, &key->flags)) + goto error; + + /* check it hasn't expired */ + ret = 0; + if (key->expiry) { + now = current_kernel_time(); + if (now.tv_sec >= key->expiry) + ret = -EKEYEXPIRED; + } + } + + error: + return ret; + +} /* end key_validate() */ + +EXPORT_SYMBOL(key_validate); diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 566b1cc011..74cb79eb91 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -270,9 +270,14 @@ int copy_thread_group_keys(struct task_struct *tsk) int copy_keys(unsigned long clone_flags, struct task_struct *tsk) { key_check(tsk->thread_keyring); + key_check(tsk->request_key_auth); /* no thread keyring yet */ tsk->thread_keyring = NULL; + + /* copy the request_key() authorisation for this thread */ + key_get(tsk->request_key_auth); + return 0; } /* end copy_keys() */ @@ -290,11 +295,12 @@ void exit_thread_group_keys(struct signal_struct *tg) /*****************************************************************************/ /* - * dispose of keys upon thread exit + * dispose of per-thread keys upon thread exit */ void exit_keys(struct task_struct *tsk) { key_put(tsk->thread_keyring); + key_put(tsk->request_key_auth); } /* end exit_keys() */ @@ -382,7 +388,7 @@ key_ref_t search_process_keyrings(struct key_type *type, struct task_struct *context) { struct request_key_auth *rka; - key_ref_t key_ref, ret, err, instkey_ref; + key_ref_t key_ref, ret, err; /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; @@ -461,30 +467,12 @@ key_ref_t search_process_keyrings(struct key_type *type, err = key_ref; break; } - - /* if this process has a session keyring and that has an - * instantiation authorisation key in the bottom level, then we - * also search the keyrings of the process mentioned there */ - if (context != current) - goto no_key; - - rcu_read_lock(); - instkey_ref = __keyring_search_one( - make_key_ref(rcu_dereference( - context->signal->session_keyring), - 1), - &key_type_request_key_auth, NULL, 0); - rcu_read_unlock(); - - if (IS_ERR(instkey_ref)) - goto no_key; - - rka = key_ref_to_ptr(instkey_ref)->payload.data; - - key_ref = search_process_keyrings(type, description, match, - rka->context); - key_ref_put(instkey_ref); - + } + /* or search the user-session keyring */ + else { + key_ref = keyring_search_aux( + make_key_ref(context->user->session_keyring, 1), + context, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -500,11 +488,21 @@ key_ref_t search_process_keyrings(struct key_type *type, break; } } - /* or search the user-session keyring */ - else { - key_ref = keyring_search_aux( - 
make_key_ref(context->user->session_keyring, 1), - context, type, description, match); + + /* if this process has an instantiation authorisation key, then we also + * search the keyrings of the process mentioned there + * - we don't permit access to request_key auth keys via this method + */ + if (context->request_key_auth && + context == current && + type != &key_type_request_key_auth && + key_validate(context->request_key_auth) == 0 + ) { + rka = context->request_key_auth->payload.data; + + key_ref = search_process_keyrings(type, description, match, + rka->context); + if (!IS_ERR(key_ref)) goto found; @@ -521,8 +519,6 @@ key_ref_t search_process_keyrings(struct key_type *type, } } - -no_key: /* no key - decide on the error we're going to go for */ key_ref = ret ? ret : err; @@ -628,6 +624,15 @@ key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id, key = ERR_PTR(-EINVAL); goto error; + case KEY_SPEC_REQKEY_AUTH_KEY: + key = context->request_key_auth; + if (!key) + goto error; + + atomic_inc(&key->usage); + key_ref = make_key_ref(key, 1); + break; + default: key_ref = ERR_PTR(-EINVAL); if (id < 1) diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 5cc4bba70d..f030a0ccbb 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -29,28 +29,36 @@ DECLARE_WAIT_QUEUE_HEAD(request_key_conswq); /*****************************************************************************/ /* * request userspace finish the construction of a key - * - execute "/sbin/request-key " + * - execute "/sbin/request-key " */ -static int call_request_key(struct key *key, - const char *op, - const char *callout_info) +static int call_sbin_request_key(struct key *key, + struct key *authkey, + const char *op) { struct task_struct *tsk = current; key_serial_t prkey, sskey; - struct key *session_keyring, *rkakey; - char *argv[10], *envp[3], uid_str[12], gid_str[12]; + struct key *keyring; + char *argv[9], *envp[3], uid_str[12], gid_str[12]; char key_str[12], keyring_str[3][12]; + char desc[20]; int ret, i; - kenter("{%d},%s,%s", key->serial, op, callout_info); + kenter("{%d},{%d},%s", key->serial, authkey->serial, op); - /* generate a new session keyring with an auth key in it */ - session_keyring = request_key_auth_new(key, &rkakey); - if (IS_ERR(session_keyring)) { - ret = PTR_ERR(session_keyring); - goto error; + /* allocate a new session keyring */ + sprintf(desc, "_req.%u", key->serial); + + keyring = keyring_alloc(desc, current->fsuid, current->fsgid, 1, NULL); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto error_alloc; } + /* attach the auth key to the session keyring */ + ret = __key_link(keyring, authkey); + if (ret < 0) + goto error_link; + /* record the UID and GID */ sprintf(uid_str, "%d", current->fsuid); sprintf(gid_str, "%d", current->fsgid); @@ -95,22 +103,19 @@ static int call_request_key(struct key *key, argv[i++] = keyring_str[0]; argv[i++] = keyring_str[1]; argv[i++] = keyring_str[2]; - argv[i++] = (char *) callout_info; argv[i] = NULL; /* do it */ - ret = call_usermodehelper_keys(argv[0], argv, envp, session_keyring, 1); + ret = call_usermodehelper_keys(argv[0], argv, envp, keyring, 1); - /* dispose of the special keys */ - key_revoke(rkakey); - key_put(rkakey); - key_put(session_keyring); +error_link: + key_put(keyring); - error: +error_alloc: kleave(" = %d", ret); return ret; -} /* end call_request_key() */ +} /* end call_sbin_request_key() */ /*****************************************************************************/ /* @@ 
-122,9 +127,10 @@ static struct key *__request_key_construction(struct key_type *type, const char *description, const char *callout_info) { + request_key_actor_t actor; struct key_construction cons; struct timespec now; - struct key *key; + struct key *key, *authkey; int ret, negated; kenter("%s,%s,%s", type->name, description, callout_info); @@ -143,8 +149,19 @@ static struct key *__request_key_construction(struct key_type *type, /* we drop the construction sem here on behalf of the caller */ up_write(&key_construction_sem); + /* allocate an authorisation key */ + authkey = request_key_auth_new(key, callout_info); + if (IS_ERR(authkey)) { + ret = PTR_ERR(authkey); + authkey = NULL; + goto alloc_authkey_failed; + } + /* make the call */ - ret = call_request_key(key, "create", callout_info); + actor = call_sbin_request_key; + if (type->request_key) + actor = type->request_key; + ret = actor(key, authkey, "create"); if (ret < 0) goto request_failed; @@ -153,22 +170,29 @@ static struct key *__request_key_construction(struct key_type *type, if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) goto request_failed; + key_revoke(authkey); + key_put(authkey); + down_write(&key_construction_sem); list_del(&cons.link); up_write(&key_construction_sem); /* also give an error if the key was negatively instantiated */ - check_not_negative: +check_not_negative: if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { key_put(key); key = ERR_PTR(-ENOKEY); } - out: +out: kleave(" = %p", key); return key; - request_failed: +request_failed: + key_revoke(authkey); + key_put(authkey); + +alloc_authkey_failed: /* it wasn't instantiated * - remove from construction queue * - mark the key as dead @@ -217,7 +241,7 @@ static struct key *__request_key_construction(struct key_type *type, key = ERR_PTR(ret); goto out; - alloc_failed: +alloc_failed: up_write(&key_construction_sem); goto out; @@ -464,35 +488,3 @@ struct key *request_key(struct key_type *type, } /* end request_key() */ EXPORT_SYMBOL(request_key); - -/*****************************************************************************/ -/* - * validate a key - */ -int key_validate(struct key *key) -{ - struct timespec now; - int ret = 0; - - if (key) { - /* check it's still accessible */ - ret = -EKEYREVOKED; - if (test_bit(KEY_FLAG_REVOKED, &key->flags) || - test_bit(KEY_FLAG_DEAD, &key->flags)) - goto error; - - /* check it hasn't expired */ - ret = 0; - if (key->expiry) { - now = current_kernel_time(); - if (now.tv_sec >= key->expiry) - ret = -EKEYEXPIRED; - } - } - - error: - return ret; - -} /* end key_validate() */ - -EXPORT_SYMBOL(key_validate); diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index a8e4069d48..cce6ba6b03 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -15,11 +15,13 @@ #include #include #include +#include #include "internal.h" static int request_key_auth_instantiate(struct key *, const void *, size_t); static void request_key_auth_describe(const struct key *, struct seq_file *); static void request_key_auth_destroy(struct key *); +static long request_key_auth_read(const struct key *, char __user *, size_t); /* * the request-key authorisation key type definition @@ -30,51 +32,25 @@ struct key_type key_type_request_key_auth = { .instantiate = request_key_auth_instantiate, .describe = request_key_auth_describe, .destroy = request_key_auth_destroy, + .read = request_key_auth_read, }; /*****************************************************************************/ /* - * instantiate a 
request-key authorisation record + * instantiate a request-key authorisation key */ static int request_key_auth_instantiate(struct key *key, const void *data, size_t datalen) { - struct request_key_auth *rka, *irka; - struct key *instkey; - int ret; - - ret = -ENOMEM; - rka = kmalloc(sizeof(*rka), GFP_KERNEL); - if (rka) { - /* see if the calling process is already servicing the key - * request of another process */ - instkey = key_get_instantiation_authkey(0); - if (!IS_ERR(instkey)) { - /* it is - use that instantiation context here too */ - irka = instkey->payload.data; - rka->context = irka->context; - rka->pid = irka->pid; - key_put(instkey); - } - else { - /* it isn't - use this process as the context */ - rka->context = current; - rka->pid = current->pid; - } - - rka->target_key = key_get((struct key *) data); - key->payload.data = rka; - ret = 0; - } - - return ret; + key->payload.data = (struct request_key_auth *) data; + return 0; } /* end request_key_auth_instantiate() */ /*****************************************************************************/ /* - * + * reading a request-key authorisation key retrieves the callout information */ static void request_key_auth_describe(const struct key *key, struct seq_file *m) @@ -83,10 +59,38 @@ static void request_key_auth_describe(const struct key *key, seq_puts(m, "key:"); seq_puts(m, key->description); - seq_printf(m, " pid:%d", rka->pid); + seq_printf(m, " pid:%d ci:%zu", rka->pid, strlen(rka->callout_info)); } /* end request_key_auth_describe() */ +/*****************************************************************************/ +/* + * read the callout_info data + * - the key's semaphore is read-locked + */ +static long request_key_auth_read(const struct key *key, + char __user *buffer, size_t buflen) +{ + struct request_key_auth *rka = key->payload.data; + size_t datalen; + long ret; + + datalen = strlen(rka->callout_info); + ret = datalen; + + /* we can return the data as is */ + if (buffer && buflen > 0) { + if (buflen > datalen) + buflen = datalen; + + if (copy_to_user(buffer, rka->callout_info, buflen) != 0) + ret = -EFAULT; + } + + return ret; + +} /* end request_key_auth_read() */ + /*****************************************************************************/ /* * destroy an instantiation authorisation token key @@ -104,54 +108,87 @@ static void request_key_auth_destroy(struct key *key) /*****************************************************************************/ /* - * create a session keyring to be for the invokation of /sbin/request-key and - * stick an authorisation token in it + * create an authorisation token for /sbin/request-key or whoever to gain + * access to the caller's security data */ -struct key *request_key_auth_new(struct key *target, struct key **_rkakey) +struct key *request_key_auth_new(struct key *target, const char *callout_info) { - struct key *keyring, *rkakey = NULL; + struct request_key_auth *rka, *irka; + struct key *authkey = NULL; char desc[20]; int ret; kenter("%d,", target->serial); - /* allocate a new session keyring */ - sprintf(desc, "_req.%u", target->serial); + /* allocate a auth record */ + rka = kmalloc(sizeof(*rka), GFP_KERNEL); + if (!rka) { + kleave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } - keyring = keyring_alloc(desc, current->fsuid, current->fsgid, 1, NULL); - if (IS_ERR(keyring)) { - kleave("= %ld", PTR_ERR(keyring)); - return keyring; + /* see if the calling process is already servicing the key request of + * another process */ + if (current->request_key_auth) { + /* it 
is - use that instantiation context here too */ + irka = current->request_key_auth->payload.data; + rka->context = irka->context; + rka->pid = irka->pid; } + else { + /* it isn't - use this process as the context */ + rka->context = current; + rka->pid = current->pid; + } + + rka->target_key = key_get(target); + rka->callout_info = callout_info; /* allocate the auth key */ sprintf(desc, "%x", target->serial); - rkakey = key_alloc(&key_type_request_key_auth, desc, - current->fsuid, current->fsgid, - KEY_POS_VIEW | KEY_USR_VIEW, 1); - if (IS_ERR(rkakey)) { - key_put(keyring); - kleave("= %ld", PTR_ERR(rkakey)); - return rkakey; + authkey = key_alloc(&key_type_request_key_auth, desc, + current->fsuid, current->fsgid, + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | + KEY_USR_VIEW, 1); + if (IS_ERR(authkey)) { + ret = PTR_ERR(authkey); + goto error_alloc; } /* construct and attach to the keyring */ - ret = key_instantiate_and_link(rkakey, target, 0, keyring, NULL); - if (ret < 0) { - key_revoke(rkakey); - key_put(rkakey); - key_put(keyring); - kleave("= %d", ret); - return ERR_PTR(ret); - } + ret = key_instantiate_and_link(authkey, rka, 0, NULL, NULL); + if (ret < 0) + goto error_inst; - *_rkakey = rkakey; - kleave(" = {%d} ({%d})", keyring->serial, rkakey->serial); - return keyring; + kleave(" = {%d})", authkey->serial); + return authkey; + +error_inst: + key_revoke(authkey); + key_put(authkey); +error_alloc: + key_put(rka->target_key); + kfree(rka); + kleave("= %d", ret); + return ERR_PTR(ret); } /* end request_key_auth_new() */ +/*****************************************************************************/ +/* + * see if an authorisation key is associated with a particular key + */ +static int key_get_instantiation_authkey_match(const struct key *key, + const void *_id) +{ + struct request_key_auth *rka = key->payload.data; + key_serial_t id = (key_serial_t)(unsigned long) _id; + + return rka->target_key->serial == id; + +} /* end key_get_instantiation_authkey_match() */ + /*****************************************************************************/ /* * get the authorisation key for instantiation of a specific key if attached to @@ -162,22 +199,27 @@ struct key *request_key_auth_new(struct key *target, struct key **_rkakey) */ struct key *key_get_instantiation_authkey(key_serial_t target_id) { - struct task_struct *tsk = current; - struct key *instkey; - - /* we must have our own personal session keyring */ - if (!tsk->signal->session_keyring) - return ERR_PTR(-EACCES); - - /* and it must contain a suitable request authorisation key - * - lock RCU against session keyring changing - */ - rcu_read_lock(); + struct key *authkey; + key_ref_t authkey_ref; + + authkey_ref = search_process_keyrings( + &key_type_request_key_auth, + (void *) (unsigned long) target_id, + key_get_instantiation_authkey_match, + current); + + if (IS_ERR(authkey_ref)) { + authkey = ERR_PTR(PTR_ERR(authkey_ref)); + goto error; + } - instkey = keyring_search_instkey( - rcu_dereference(tsk->signal->session_keyring), target_id); + authkey = key_ref_to_ptr(authkey_ref); + if (test_bit(KEY_FLAG_REVOKED, &authkey->flags)) { + key_put(authkey); + authkey = ERR_PTR(-EKEYREVOKED); + } - rcu_read_unlock(); - return instkey; +error: + return authkey; } /* end key_get_instantiation_authkey() */ -- cgit v1.2.2 From 71fabd5e4835309b4feca6209122ce56c595c461 Mon Sep 17 00:00:00 2001 From: George Anzinger Date: Sun, 8 Jan 2006 01:02:48 -0800 Subject: [PATCH] sigaction should clear all signals on SIG_IGN, not just < 32 While rooting 
around in the signal code trying to understand how to fix the SIG_IGN ploy (set sig handler to SIG_IGN and flood system with high speed repeating timers) I came across what, I think, is a problem in sigaction() in that when processing a SIG_IGN request it flushes signals from 1 to SIGRTMIN and leaves the rest. Attempt to fix this. Signed-off-by: George Anzinger Cc: Roland McGrath Cc: Linus Torvalds Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 17 +++++++++++++++++ kernel/signal.c | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/include/linux/signal.h b/include/linux/signal.h index ea9eff16c4..b7d093520b 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -94,6 +94,23 @@ static inline int sigfindinword(unsigned long word) #endif /* __HAVE_ARCH_SIG_BITOPS */ +static inline int sigisemptyset(sigset_t *set) +{ + extern void _NSIG_WORDS_is_unsupported_size(void); + switch (_NSIG_WORDS) { + case 4: + return (set->sig[3] | set->sig[2] | + set->sig[1] | set->sig[0]) == 0; + case 2: + return (set->sig[1] | set->sig[0]) == 0; + case 1: + return set->sig[0] == 0; + default: + _NSIG_WORDS_is_unsupported_size(); + return 0; + } +} + #define sigmask(sig) (1UL << ((sig) - 1)) #ifndef __HAVE_ARCH_SIG_SETOPS diff --git a/kernel/signal.c b/kernel/signal.c index 9b6fda5e87..e20724af9b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -620,6 +620,33 @@ void signal_wake_up(struct task_struct *t, int resume) kick_process(t); } +/* + * Remove signals in mask from the pending set and queue. + * Returns 1 if any signals were found. + * + * All callers must be holding the siglock. + * + * This version takes a sigset mask and looks at all signals, + * not just those in the first mask word. + */ +static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) +{ + struct sigqueue *q, *n; + sigset_t m; + + sigandsets(&m, mask, &s->signal); + if (sigisemptyset(&m)) + return 0; + + signandsets(&s->signal, &s->signal, mask); + list_for_each_entry_safe(q, n, &s->list, list) { + if (sigismember(mask, q->info.si_signo)) { + list_del_init(&q->list); + __sigqueue_free(q); + } + } + return 1; +} /* * Remove signals in mask from the pending set and queue. * Returns 1 if any signals were found. 
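For reference, the intended user-visible effect can be demonstrated with plain POSIX calls (an illustration only, not part of the patch): a pending realtime signal should now be discarded when its disposition is set to SIG_IGN, not just signals below SIGRTMIN.

/* illustrative only: queue a blocked realtime signal, then ignore it */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct sigaction sa;
	union sigval val;
	sigset_t block, pending;

	sigemptyset(&block);
	sigaddset(&block, SIGRTMIN);
	sigprocmask(SIG_BLOCK, &block, NULL);

	val.sival_int = 1;
	sigqueue(getpid(), SIGRTMIN, val);	/* now pending */

	sa.sa_handler = SIG_IGN;
	sigemptyset(&sa.sa_mask);
	sa.sa_flags = 0;
	sigaction(SIGRTMIN, &sa, NULL);		/* should flush the pending signal */

	sigpending(&pending);
	printf("SIGRTMIN still pending: %d\n",
	       sigismember(&pending, SIGRTMIN));
	return 0;
}

With the old rm_from_queue()-based flush only the low, non-realtime signals were reliably removed; rm_from_queue_full() above considers the whole signal set, so the program should print 0.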
@@ -2408,6 +2435,7 @@ int do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) { struct k_sigaction *k; + sigset_t mask; if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) return -EINVAL; @@ -2455,9 +2483,11 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) *k = *act; sigdelsetmask(&k->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - rm_from_queue(sigmask(sig), &t->signal->shared_pending); + sigemptyset(&mask); + sigaddset(&mask, sig); + rm_from_queue_full(&mask, &t->signal->shared_pending); do { - rm_from_queue(sigmask(sig), &t->pending); + rm_from_queue_full(&mask, &t->pending); recalc_sigpending_tsk(t); t = next_thread(t); } while (t != current); -- cgit v1.2.2 From 5b0ed2c64d8fdafb5fcfb3baabdd288628b1ff9b Mon Sep 17 00:00:00 2001 From: Xose Vazquez Perez Date: Sun, 8 Jan 2006 01:02:49 -0800 Subject: [PATCH] docs: updated some code docs Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/CodingStyle | 7 +++-- Documentation/SubmittingDrivers | 24 +++++++++++------ Documentation/SubmittingPatches | 60 ++++++++++++++++++++++++++++------------- 3 files changed, 63 insertions(+), 28 deletions(-) diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle index eb7db3c192..187e12077e 100644 --- a/Documentation/CodingStyle +++ b/Documentation/CodingStyle @@ -444,10 +444,13 @@ ISBN 0-201-61586-X. URL: http://cm.bell-labs.com/cm/cs/tpop/ GNU manuals - where in compliance with K&R and this text - for cpp, gcc, -gcc internals and indent, all available from http://www.gnu.org +gcc internals and indent, all available from http://www.gnu.org/manual/ WG14 is the international standardization working group for the programming -language C, URL: http://std.dkuug.dk/JTC1/SC22/WG14/ +language C, URL: http://www.open-std.org/JTC1/SC22/WG14/ + +Kernel CodingStyle, by greg@kroah.com at OLS 2002: +http://www.kroah.com/linux/talks/ols_2002_kernel_codingstyle_talk/html/ -- Last updated on 16 February 2004 by a community effort on LKML. diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers index c3cca924e9..dd311cff1c 100644 --- a/Documentation/SubmittingDrivers +++ b/Documentation/SubmittingDrivers @@ -27,18 +27,17 @@ Who To Submit Drivers To ------------------------ Linux 2.0: - No new drivers are accepted for this kernel tree + No new drivers are accepted for this kernel tree. Linux 2.2: + No new drivers are accepted for this kernel tree. + +Linux 2.4: If the code area has a general maintainer then please submit it to the maintainer listed in MAINTAINERS in the kernel file. If the maintainer does not respond or you cannot find the appropriate - maintainer then please contact the 2.2 kernel maintainer: - Marc-Christian Petersen . - -Linux 2.4: - The same rules apply as 2.2. The final contact point for Linux 2.4 - submissions is Marcelo Tosatti . + maintainer then please contact Marcelo Tosatti + . Linux 2.6: The same rules apply as 2.4 except that you should follow linux-kernel @@ -53,6 +52,7 @@ Licensing: The code must be released to us under the of exclusive GPL licensing, and if you wish the driver to be useful to other communities such as BSD you may well wish to release under multiple licenses. + See accepted licenses at include/linux/module.h Copyright: The copyright owner must agree to use of GPL. 
It's best if the submitter and copyright owner @@ -143,5 +143,13 @@ KernelNewbies: http://kernelnewbies.org/ Linux USB project: - http://sourceforge.net/projects/linux-usb/ + http://linux-usb.sourceforge.net/ + +How to NOT write kernel driver by arjanv@redhat.com + http://people.redhat.com/arjanv/olspaper.pdf + +Kernel Janitor: + http://janitor.kernelnewbies.org/ +-- +Last updated on 17 Nov 2005. diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches index 1d47e6c09d..3c12fc1469 100644 --- a/Documentation/SubmittingPatches +++ b/Documentation/SubmittingPatches @@ -78,7 +78,9 @@ Randy Dunlap's patch scripts: http://www.xenotime.net/linux/scripts/patching-scripts-002.tar.gz Andrew Morton's patch scripts: -http://www.zip.com.au/~akpm/linux/patches/patch-scripts-0.20 +http://www.zip.com.au/~akpm/linux/patches/ +Instead of these scripts, quilt is the recommended patch management +tool (see above). @@ -97,7 +99,7 @@ need to split up your patch. See #3, next. 3) Separate your changes. -Separate each logical change into its own patch. +Separate _logical changes_ into a single patch file. For example, if your changes include both bug fixes and performance enhancements for a single driver, separate those changes into two @@ -112,6 +114,10 @@ If one patch depends on another patch in order for a change to be complete, that is OK. Simply note "this patch depends on patch X" in your patch description. +If you cannot condense your patch set into a smaller set of patches, +then only post say 15 or so at a time and wait for review and integration. + + 4) Select e-mail destination. @@ -124,6 +130,10 @@ your patch to the primary Linux kernel developer's mailing list, linux-kernel@vger.kernel.org. Most kernel developers monitor this e-mail list, and can comment on your changes. + +Do not send more than 15 patches at once to the vger mailing lists!!! + + Linus Torvalds is the final arbiter of all changes accepted into the Linux kernel. His e-mail address is . He gets a lot of e-mail, so typically you should do your best to -avoid- sending @@ -149,6 +159,9 @@ USB, framebuffer devices, the VFS, the SCSI subsystem, etc. See the MAINTAINERS file for a mailing list that relates specifically to your change. +Majordomo lists of VGER.KERNEL.ORG at: + + If changes affect userland-kernel interfaces, please send the MAN-PAGES maintainer (as listed in the MAINTAINERS file) a man-pages patch, or at least a notification of the change, @@ -378,22 +391,6 @@ See more details on the proper patch format in the following references. -13) More references for submitting patches - -Andrew Morton, "The perfect patch" (tpp). - - -Jeff Garzik, "Linux kernel patch submission format." - - -Greg KH, "How to piss off a kernel subsystem maintainer" - - -Kernel Documentation/CodingStyle - - -Linus Torvald's mail on the canonical patch format: - ----------------------------------- @@ -466,3 +463,30 @@ and 'extern __inline__'. Don't try to anticipate nebulous future cases which may or may not be useful: "Make it as simple as you can, and no simpler." + + +---------------------- +SECTION 3 - REFERENCES +---------------------- + +Andrew Morton, "The perfect patch" (tpp). + + +Jeff Garzik, "Linux kernel patch submission format." + + +Greg Kroah, "How to piss off a kernel subsystem maintainer". + + + + +NO!!!! No more huge patch bombs to linux-kernel@vger.kernel.org people!. + + +Kernel Documentation/CodingStyle + + +Linus Torvald's mail on the canonical patch format: + +-- +Last updated on 17 Nov 2005. 
-- cgit v1.2.2 From a885c8c4316e1c1d2d2c8755da3f3d14f852528d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 8 Jan 2006 01:02:50 -0800 Subject: [PATCH] Add block_device_operations.getgeo block device method HDIO_GETGEO is implemented in most block drivers, and all of them have to duplicate the code to copy the structure to userspace, as well as getting the start sector. This patch moves that to common code [1] and adds a ->getgeo method to fill out the raw kernel hd_geometry structure. For many drivers this means ->ioctl can go away now. [1] the s390 block drivers are odd in this respect. xpram sets ->start to 4 always which seems more than odd, and the dasd driver shifts the start offset around, probably because of it's non-standard sector size. Signed-off-by: Christoph Hellwig Cc: Jens Axboe Cc: Cc: Jeff Dike Cc: Paolo Giarrusso Cc: Bartlomiej Zolnierkiewicz Cc: Neil Brown Cc: Markus Lidel Cc: Russell King Cc: David Woodhouse Cc: Martin Schwidefsky Cc: James Bottomley Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/drivers/ubd_kern.c | 21 +++++++++-------- block/ioctl.c | 22 ++++++++++++++++++ drivers/acorn/block/mfmhd.c | 36 ++++++----------------------- drivers/block/DAC960.c | 35 ++++++++++++----------------- drivers/block/acsi.c | 26 +++++++++++---------- drivers/block/amiflop.c | 23 +++++++++---------- drivers/block/aoe/aoeblk.c | 26 ++++++--------------- drivers/block/cciss.c | 31 ++++++++++++------------- drivers/block/cpqarray.c | 36 +++++++++++++++-------------- drivers/block/floppy.c | 35 +++++++++++++++-------------- drivers/block/paride/pd.c | 34 +++++++++++++++------------- drivers/block/paride/pf.c | 50 +++++++++++++++++++++-------------------- drivers/block/ps2esdi.c | 25 +++++++-------------- drivers/block/sx8.c | 35 +++++++---------------------- drivers/block/umem.c | 41 +++++++++++++-------------------- drivers/block/viodasd.c | 44 ++++++++---------------------------- drivers/block/xd.c | 25 +++++++++++---------- drivers/ide/ide-disk.c | 12 ++++++++++ drivers/ide/ide-floppy.c | 12 ++++++++++ drivers/ide/ide.c | 13 ----------- drivers/ide/legacy/hd.c | 24 +++++++------------- drivers/md/md.c | 30 +++++++++---------------- drivers/message/i2o/i2o_block.c | 18 +++++++-------- drivers/mmc/mmc_block.c | 25 +++++---------------- drivers/mtd/mtd_blkdevs.c | 25 ++++++++------------- drivers/s390/block/dasd.c | 23 +++++++++++++++++++ drivers/s390/block/dasd_ioctl.c | 28 ----------------------- drivers/s390/block/xpram.c | 18 ++++++--------- drivers/scsi/sd.c | 21 +++++------------ include/linux/fs.h | 2 ++ 30 files changed, 340 insertions(+), 456 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 73f9652b2e..3a93c6f772 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -117,6 +117,7 @@ static int ubd_open(struct inode * inode, struct file * filp); static int ubd_release(struct inode * inode, struct file * file); static int ubd_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg); +static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); #define MAX_DEV (8) @@ -125,6 +126,7 @@ static struct block_device_operations ubd_blops = { .open = ubd_open, .release = ubd_release, .ioctl = ubd_ioctl, + .getgeo = ubd_getgeo, }; /* Protected by the queue_lock */ @@ -1058,6 +1060,16 @@ static void do_ubd_request(request_queue_t *q) } } +static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct 
ubd *dev = bdev->bd_disk->private_data; + + geo->heads = 128; + geo->sectors = 32; + geo->cylinders = dev->size / (128 * 32 * 512); + return 0; +} + static int ubd_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg) { @@ -1070,16 +1082,7 @@ static int ubd_ioctl(struct inode * inode, struct file * file, }; switch (cmd) { - struct hd_geometry g; struct cdrom_volctrl volume; - case HDIO_GETGEO: - if(!loc) return(-EINVAL); - g.heads = 128; - g.sectors = 32; - g.cylinders = dev->size / (128 * 32 * 512); - g.start = get_start_sect(inode->i_bdev); - return(copy_to_user(loc, &g, sizeof(g)) ? -EFAULT : 0); - case HDIO_GET_IDENTITY: ubd_id.cyls = dev->size / (128 * 32 * 512); if(copy_to_user((char __user *) arg, (char *) &ubd_id, diff --git a/block/ioctl.c b/block/ioctl.c index 6e278474f9..82030e1dfd 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -1,6 +1,7 @@ #include /* for capable() */ #include #include +#include #include #include #include @@ -245,6 +246,27 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd, set_device_ro(bdev, n); unlock_kernel(); return 0; + case HDIO_GETGEO: { + struct hd_geometry geo; + + if (!arg) + return -EINVAL; + if (!disk->fops->getgeo) + return -ENOTTY; + + /* + * We need to set the startsect first, the driver may + * want to override it. + */ + geo.start = get_start_sect(bdev); + ret = disk->fops->getgeo(bdev, &geo); + if (ret) + return ret; + if (copy_to_user((struct hd_geometry __user *)arg, &geo, + sizeof(geo))) + return -EFAULT; + return 0; + } } lock_kernel(); diff --git a/drivers/acorn/block/mfmhd.c b/drivers/acorn/block/mfmhd.c index 4b65f74d66..ce074f6f33 100644 --- a/drivers/acorn/block/mfmhd.c +++ b/drivers/acorn/block/mfmhd.c @@ -129,19 +129,6 @@ static DEFINE_SPINLOCK(mfm_lock); #define MAJOR_NR MFM_ACORN_MAJOR #define QUEUE (mfm_queue) #define CURRENT elv_next_request(mfm_queue) -/* - * This sort of stuff should be in a header file shared with ide.c, hd.c, xd.c etc - */ -#ifndef HDIO_GETGEO -#define HDIO_GETGEO 0x301 -struct hd_geometry { - unsigned char heads; - unsigned char sectors; - unsigned short cylinders; - unsigned long start; -}; -#endif - /* * Configuration section @@ -1153,22 +1140,13 @@ static int mfm_initdrives(void) * The 'front' end of the mfm driver follows... 
*/ -static int mfm_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long arg) +static int mfm_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - struct mfm_info *p = inode->i_bdev->bd_disk->private_data; - struct hd_geometry *geo = (struct hd_geometry *) arg; - if (cmd != HDIO_GETGEO) - return -EINVAL; - if (!arg) - return -EINVAL; - if (put_user (p->heads, &geo->heads)) - return -EFAULT; - if (put_user (p->sectors, &geo->sectors)) - return -EFAULT; - if (put_user (p->cylinders, &geo->cylinders)) - return -EFAULT; - if (put_user (get_start_sect(inode->i_bdev), &geo->start)) - return -EFAULT; + struct mfm_info *p = bdev->bd_disk->private_data; + + geo->heads = p->heads; + geo->sectors = p->sectors; + geo->cylinders = p->cylinders; return 0; } @@ -1219,7 +1197,7 @@ void xd_set_geometry(struct block_device *bdev, unsigned char secsptrack, static struct block_device_operations mfm_fops = { .owner = THIS_MODULE, - .ioctl = mfm_ioctl, + .getgeo = mfm_getgeo, }; /* diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 21097a39a0..179c68a3ce 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -92,34 +92,28 @@ static int DAC960_open(struct inode *inode, struct file *file) return 0; } -static int DAC960_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) +static int DAC960_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - struct gendisk *disk = inode->i_bdev->bd_disk; + struct gendisk *disk = bdev->bd_disk; DAC960_Controller_T *p = disk->queue->queuedata; int drive_nr = (long)disk->private_data; - struct hd_geometry g; - struct hd_geometry __user *loc = (struct hd_geometry __user *)arg; - - if (cmd != HDIO_GETGEO || !loc) - return -EINVAL; if (p->FirmwareType == DAC960_V1_Controller) { - g.heads = p->V1.GeometryTranslationHeads; - g.sectors = p->V1.GeometryTranslationSectors; - g.cylinders = p->V1.LogicalDriveInformation[drive_nr]. - LogicalDriveSize / (g.heads * g.sectors); + geo->heads = p->V1.GeometryTranslationHeads; + geo->sectors = p->V1.GeometryTranslationSectors; + geo->cylinders = p->V1.LogicalDriveInformation[drive_nr]. + LogicalDriveSize / (geo->heads * geo->sectors); } else { DAC960_V2_LogicalDeviceInfo_T *i = p->V2.LogicalDeviceInformation[drive_nr]; switch (i->DriveGeometry) { case DAC960_V2_Geometry_128_32: - g.heads = 128; - g.sectors = 32; + geo->heads = 128; + geo->sectors = 32; break; case DAC960_V2_Geometry_255_63: - g.heads = 255; - g.sectors = 63; + geo->heads = 255; + geo->sectors = 63; break; default: DAC960_Error("Illegal Logical Device Geometry %d\n", @@ -127,12 +121,11 @@ static int DAC960_ioctl(struct inode *inode, struct file *file, return -EINVAL; } - g.cylinders = i->ConfigurableDeviceSize / (g.heads * g.sectors); + geo->cylinders = i->ConfigurableDeviceSize / + (geo->heads * geo->sectors); } - g.start = get_start_sect(inode->i_bdev); - - return copy_to_user(loc, &g, sizeof g) ? 
-EFAULT : 0; + return 0; } static int DAC960_media_changed(struct gendisk *disk) @@ -157,7 +150,7 @@ static int DAC960_revalidate_disk(struct gendisk *disk) static struct block_device_operations DAC960_BlockDeviceOperations = { .owner = THIS_MODULE, .open = DAC960_open, - .ioctl = DAC960_ioctl, + .getgeo = DAC960_getgeo, .media_changed = DAC960_media_changed, .revalidate_disk = DAC960_revalidate_disk, }; diff --git a/drivers/block/acsi.c b/drivers/block/acsi.c index 5d2d649f7e..196c0ec9cd 100644 --- a/drivers/block/acsi.c +++ b/drivers/block/acsi.c @@ -1079,6 +1079,19 @@ static void redo_acsi_request( void ) * ***********************************************************************/ +static int acsi_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct acsi_info_struct *aip = bdev->bd_disk->private_data; + + /* + * Just fake some geometry here, it's nonsense anyway + * To make it easy, use Adaptec's usual 64/32 mapping + */ + geo->heads = 64; + geo->sectors = 32; + geo->cylinders = aip->size >> 11; + return 0; +} static int acsi_ioctl( struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg ) @@ -1086,18 +1099,6 @@ static int acsi_ioctl( struct inode *inode, struct file *file, struct gendisk *disk = inode->i_bdev->bd_disk; struct acsi_info_struct *aip = disk->private_data; switch (cmd) { - case HDIO_GETGEO: - /* HDIO_GETGEO is supported more for getting the partition's - * start sector... */ - { struct hd_geometry *geo = (struct hd_geometry *)arg; - /* just fake some geometry here, it's nonsense anyway; to make it - * easy, use Adaptec's usual 64/32 mapping */ - put_user( 64, &geo->heads ); - put_user( 32, &geo->sectors ); - put_user( aip->size >> 11, &geo->cylinders ); - put_user(get_start_sect(inode->i_bdev), &geo->start); - return 0; - } case SCSI_IOCTL_GET_IDLUN: /* SCSI compatible GET_IDLUN call to get target's ID and LUN number */ put_user( aip->target | (aip->lun << 8), @@ -1592,6 +1593,7 @@ static struct block_device_operations acsi_fops = { .open = acsi_open, .release = acsi_release, .ioctl = acsi_ioctl, + .getgeo = acsi_getgeo, .media_changed = acsi_media_change, .revalidate_disk= acsi_revalidate, }; diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 0acbfff8ad..cb2a545e57 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1424,6 +1424,16 @@ static void do_fd_request(request_queue_t * q) redo_fd_request(); } +static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + int drive = MINOR(bdev->bd_dev) & 3; + + geo->heads = unit[drive].type->heads; + geo->sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult; + geo->cylinders = unit[drive].type->tracks; + return 0; +} + static int fd_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long param) { @@ -1431,18 +1441,6 @@ static int fd_ioctl(struct inode *inode, struct file *filp, static struct floppy_struct getprm; switch(cmd){ - case HDIO_GETGEO: - { - struct hd_geometry loc; - loc.heads = unit[drive].type->heads; - loc.sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult; - loc.cylinders = unit[drive].type->tracks; - loc.start = 0; - if (copy_to_user((void *)param, (void *)&loc, - sizeof(struct hd_geometry))) - return -EFAULT; - break; - } case FDFMTBEG: get_fdc(drive); if (fd_ref[drive] > 1) { @@ -1652,6 +1650,7 @@ static struct block_device_operations floppy_fops = { .open = floppy_open, .release = floppy_release, .ioctl = fd_ioctl, + .getgeo = fd_getgeo, .media_changed = amiga_floppy_change, }; 
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 0e97fcb9f3..c05ee8bffd 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -169,38 +169,26 @@ aoeblk_make_request(request_queue_t *q, struct bio *bio) return 0; } -/* This ioctl implementation expects userland to have the device node - * permissions set so that only priviledged users can open an aoe - * block device directly. - */ static int -aoeblk_ioctl(struct inode *inode, struct file *filp, uint cmd, ulong arg) +aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - struct aoedev *d; - - if (!arg) - return -EINVAL; + struct aoedev *d = bdev->bd_disk->private_data; - d = inode->i_bdev->bd_disk->private_data; if ((d->flags & DEVFL_UP) == 0) { printk(KERN_ERR "aoe: aoeblk_ioctl: disk not up\n"); return -ENODEV; } - if (cmd == HDIO_GETGEO) { - d->geo.start = get_start_sect(inode->i_bdev); - if (!copy_to_user((void __user *) arg, &d->geo, sizeof d->geo)) - return 0; - return -EFAULT; - } - printk(KERN_INFO "aoe: aoeblk_ioctl: unknown ioctl %d\n", cmd); - return -EINVAL; + geo->cylinders = d->geo.cylinders; + geo->heads = d->geo.heads; + geo->sectors = d->geo.sectors; + return 0; } static struct block_device_operations aoe_bdops = { .open = aoeblk_open, .release = aoeblk_release, - .ioctl = aoeblk_ioctl, + .getgeo = aoeblk_getgeo, .owner = THIS_MODULE, }; diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index d2815b7a91..bdb9c2717d 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -153,6 +153,7 @@ static int cciss_open(struct inode *inode, struct file *filep); static int cciss_release(struct inode *inode, struct file *filep); static int cciss_ioctl(struct inode *inode, struct file *filep, unsigned int cmd, unsigned long arg); +static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo); static int revalidate_allvol(ctlr_info_t *host); static int cciss_revalidate(struct gendisk *disk); @@ -194,6 +195,7 @@ static struct block_device_operations cciss_fops = { .open = cciss_open, .release = cciss_release, .ioctl = cciss_ioctl, + .getgeo = cciss_getgeo, #ifdef CONFIG_COMPAT .compat_ioctl = cciss_compat_ioctl, #endif @@ -633,6 +635,20 @@ static int cciss_ioctl32_big_passthru(struct file *file, unsigned cmd, unsigned return err; } #endif + +static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + drive_info_struct *drv = get_drv(bdev->bd_disk); + + if (!drv->cylinders) + return -ENXIO; + + geo->heads = drv->heads; + geo->sectors = drv->sectors; + geo->cylinders = drv->cylinders; + return 0; +} + /* * ioctl */ @@ -651,21 +667,6 @@ static int cciss_ioctl(struct inode *inode, struct file *filep, #endif /* CCISS_DEBUG */ switch(cmd) { - case HDIO_GETGEO: - { - struct hd_geometry driver_geo; - if (drv->cylinders) { - driver_geo.heads = drv->heads; - driver_geo.sectors = drv->sectors; - driver_geo.cylinders = drv->cylinders; - } else - return -ENXIO; - driver_geo.start= get_start_sect(inode->i_bdev); - if (copy_to_user(argp, &driver_geo, sizeof(struct hd_geometry))) - return -EFAULT; - return(0); - } - case CCISS_GETPCIINFO: { cciss_pci_info_struct pciinfo; diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index 9bddb68748..9f0664dd38 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -160,6 +160,7 @@ static int sendcmd( static int ida_open(struct inode *inode, struct file *filep); static int ida_release(struct inode *inode, struct file *filep); static int ida_ioctl(struct inode *inode, 
struct file *filep, unsigned int cmd, unsigned long arg); +static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo); static int ida_ctlr_ioctl(ctlr_info_t *h, int dsk, ida_ioctl_t *io); static void do_ida_request(request_queue_t *q); @@ -199,6 +200,7 @@ static struct block_device_operations ida_fops = { .open = ida_open, .release = ida_release, .ioctl = ida_ioctl, + .getgeo = ida_getgeo, .revalidate_disk= ida_revalidate, }; @@ -1124,6 +1126,23 @@ static void ida_timer(unsigned long tdata) h->misc_tflags = 0; } +static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + drv_info_t *drv = get_drv(bdev->bd_disk); + + if (drv->cylinders) { + geo->heads = drv->heads; + geo->sectors = drv->sectors; + geo->cylinders = drv->cylinders; + } else { + geo->heads = 0xff; + geo->sectors = 0x3f; + geo->cylinders = drv->nr_blks / (0xff*0x3f); + } + + return 0; +} + /* * ida_ioctl does some miscellaneous stuff like reporting drive geometry, * setting readahead and submitting commands from userspace to the controller. @@ -1133,27 +1152,10 @@ static int ida_ioctl(struct inode *inode, struct file *filep, unsigned int cmd, drv_info_t *drv = get_drv(inode->i_bdev->bd_disk); ctlr_info_t *host = get_host(inode->i_bdev->bd_disk); int error; - int diskinfo[4]; - struct hd_geometry __user *geo = (struct hd_geometry __user *)arg; ida_ioctl_t __user *io = (ida_ioctl_t __user *)arg; ida_ioctl_t *my_io; switch(cmd) { - case HDIO_GETGEO: - if (drv->cylinders) { - diskinfo[0] = drv->heads; - diskinfo[1] = drv->sectors; - diskinfo[2] = drv->cylinders; - } else { - diskinfo[0] = 0xff; - diskinfo[1] = 0x3f; - diskinfo[2] = drv->nr_blks / (0xff*0x3f); - } - put_user(diskinfo[0], &geo->heads); - put_user(diskinfo[1], &geo->sectors); - put_user(diskinfo[2], &geo->cylinders); - put_user(get_start_sect(inode->i_bdev), &geo->start); - return 0; case IDAGETDRVINFO: if (copy_to_user(&io->c.drv, drv, sizeof(drv_info_t))) return -EFAULT; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index a5b857c5c4..b86613b21c 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3445,6 +3445,23 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g) return 0; } +static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + int drive = (long)bdev->bd_disk->private_data; + int type = ITYPE(drive_state[drive].fd_device); + struct floppy_struct *g; + int ret; + + ret = get_floppy_geometry(drive, type, &g); + if (ret) + return ret; + + geo->heads = g->head; + geo->sectors = g->sect; + geo->cylinders = g->track; + return 0; +} + static int fd_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long param) { @@ -3474,23 +3491,6 @@ static int fd_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, cmd = FDEJECT; } - /* generic block device ioctls */ - switch (cmd) { - /* the following have been inspired by the corresponding - * code for other block devices. 
*/ - struct floppy_struct *g; - case HDIO_GETGEO: - { - struct hd_geometry loc; - ECALL(get_floppy_geometry(drive, type, &g)); - loc.heads = g->head; - loc.sectors = g->sect; - loc.cylinders = g->track; - loc.start = 0; - return _COPYOUT(loc); - } - } - /* convert the old style command into a new style command */ if ((cmd & 0xff00) == 0x0200) { ECALL(normalize_ioctl(&cmd, &size)); @@ -3938,6 +3938,7 @@ static struct block_device_operations floppy_fops = { .open = floppy_open, .release = floppy_release, .ioctl = fd_ioctl, + .getgeo = fd_getgeo, .media_changed = check_floppy_change, .revalidate_disk = floppy_revalidate, }; diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index fa49d62626..62d2464c12 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -747,32 +747,33 @@ static int pd_open(struct inode *inode, struct file *file) return 0; } +static int pd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct pd_unit *disk = bdev->bd_disk->private_data; + + if (disk->alt_geom) { + geo->heads = PD_LOG_HEADS; + geo->sectors = PD_LOG_SECTS; + geo->cylinders = disk->capacity / (geo->heads * geo->sectors); + } else { + geo->heads = disk->heads; + geo->sectors = disk->sectors; + geo->cylinders = disk->cylinders; + } + + return 0; +} + static int pd_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { struct pd_unit *disk = inode->i_bdev->bd_disk->private_data; - struct hd_geometry __user *geo = (struct hd_geometry __user *) arg; - struct hd_geometry g; switch (cmd) { case CDROMEJECT: if (disk->access == 1) pd_special_command(disk, pd_eject); return 0; - case HDIO_GETGEO: - if (disk->alt_geom) { - g.heads = PD_LOG_HEADS; - g.sectors = PD_LOG_SECTS; - g.cylinders = disk->capacity / (g.heads * g.sectors); - } else { - g.heads = disk->heads; - g.sectors = disk->sectors; - g.cylinders = disk->cylinders; - } - g.start = get_start_sect(inode->i_bdev); - if (copy_to_user(geo, &g, sizeof(struct hd_geometry))) - return -EFAULT; - return 0; default: return -EINVAL; } @@ -815,6 +816,7 @@ static struct block_device_operations pd_fops = { .open = pd_open, .release = pd_release, .ioctl = pd_ioctl, + .getgeo = pd_getgeo, .media_changed = pd_check_media, .revalidate_disk= pd_revalidate }; diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index e9746af29b..852b564e90 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -205,6 +205,7 @@ static int pf_open(struct inode *inode, struct file *file); static void do_pf_request(request_queue_t * q); static int pf_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); +static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo); static int pf_release(struct inode *inode, struct file *file); @@ -266,6 +267,7 @@ static struct block_device_operations pf_fops = { .open = pf_open, .release = pf_release, .ioctl = pf_ioctl, + .getgeo = pf_getgeo, .media_changed = pf_check_media, }; @@ -313,34 +315,34 @@ static int pf_open(struct inode *inode, struct file *file) return 0; } -static int pf_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) +static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - struct pf_unit *pf = inode->i_bdev->bd_disk->private_data; - struct hd_geometry __user *geo = (struct hd_geometry __user *) arg; - struct hd_geometry g; - sector_t capacity; - - if (cmd == CDROMEJECT) { - if (pf->access == 1) { - pf_eject(pf); - return 0; - } - return -EBUSY; 
- } - if (cmd != HDIO_GETGEO) - return -EINVAL; - capacity = get_capacity(pf->disk); + struct pf_unit *pf = bdev->bd_disk->private_data; + sector_t capacity = get_capacity(pf->disk); + if (capacity < PF_FD_MAX) { - g.cylinders = sector_div(capacity, PF_FD_HDS * PF_FD_SPT); - g.heads = PF_FD_HDS; - g.sectors = PF_FD_SPT; + geo->cylinders = sector_div(capacity, PF_FD_HDS * PF_FD_SPT); + geo->heads = PF_FD_HDS; + geo->sectors = PF_FD_SPT; } else { - g.cylinders = sector_div(capacity, PF_HD_HDS * PF_HD_SPT); - g.heads = PF_HD_HDS; - g.sectors = PF_HD_SPT; + geo->cylinders = sector_div(capacity, PF_HD_HDS * PF_HD_SPT); + geo->heads = PF_HD_HDS; + geo->sectors = PF_HD_SPT; } - if (copy_to_user(geo, &g, sizeof(g))) - return -EFAULT; + + return 0; +} + +static int pf_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pf_unit *pf = inode->i_bdev->bd_disk->private_data; + + if (cmd != CDROMEJECT) + return -EINVAL; + + if (pf->access != 1) + return -EBUSY; + pf_eject(pf); return 0; } diff --git a/drivers/block/ps2esdi.c b/drivers/block/ps2esdi.c index 29d1518be7..43415f6983 100644 --- a/drivers/block/ps2esdi.c +++ b/drivers/block/ps2esdi.c @@ -81,8 +81,7 @@ static void (*current_int_handler) (u_int) = NULL; static void ps2esdi_normal_interrupt_handler(u_int); static void ps2esdi_initial_reset_int_handler(u_int); static void ps2esdi_geometry_int_handler(u_int); -static int ps2esdi_ioctl(struct inode *inode, struct file *file, - u_int cmd, u_long arg); +static int ps2esdi_getgeo(struct block_device *bdev, struct hd_geometry *geo); static int ps2esdi_read_status_words(int num_words, int max_words, u_short * buffer); @@ -132,7 +131,7 @@ static struct ps2esdi_i_struct ps2esdi_info[MAX_HD] = static struct block_device_operations ps2esdi_fops = { .owner = THIS_MODULE, - .ioctl = ps2esdi_ioctl, + .getgeo = ps2esdi_getgeo, }; static struct gendisk *ps2esdi_gendisk[2]; @@ -1058,21 +1057,13 @@ static void dump_cmd_complete_status(u_int int_ret_code) } -static int ps2esdi_ioctl(struct inode *inode, - struct file *file, u_int cmd, u_long arg) +static int ps2esdi_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - struct ps2esdi_i_struct *p = inode->i_bdev->bd_disk->private_data; - struct ps2esdi_geometry geom; - - if (cmd != HDIO_GETGEO) - return -EINVAL; - memset(&geom, 0, sizeof(geom)); - geom.heads = p->head; - geom.sectors = p->sect; - geom.cylinders = p->cyl; - geom.start = get_start_sect(inode->i_bdev); - if (copy_to_user((void __user *)arg, &geom, sizeof(geom))) - return -EFAULT; + struct ps2esdi_i_struct *p = bdev->bd_disk->private_data; + + geo->heads = p->head; + geo->sectors = p->sect; + geo->cylinders = p->cyl; return 0; } diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 9251f4131b..c0cdc182a8 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -407,8 +407,7 @@ struct carm_array_info { static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent); static void carm_remove_one (struct pci_dev *pdev); -static int carm_bdev_ioctl(struct inode *ino, struct file *fil, - unsigned int cmd, unsigned long arg); +static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo); static struct pci_device_id carm_pci_tbl[] = { { PCI_VENDOR_ID_PROMISE, 0x8000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, @@ -426,7 +425,7 @@ static struct pci_driver carm_driver = { static struct block_device_operations carm_bd_ops = { .owner = THIS_MODULE, - .ioctl = carm_bdev_ioctl, + .getgeo = carm_bdev_getgeo, }; static unsigned int 
carm_host_id; @@ -434,32 +433,14 @@ static unsigned long carm_major_alloc; -static int carm_bdev_ioctl(struct inode *ino, struct file *fil, - unsigned int cmd, unsigned long arg) +static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - void __user *usermem = (void __user *) arg; - struct carm_port *port = ino->i_bdev->bd_disk->private_data; - struct hd_geometry geom; + struct carm_port *port = bdev->bd_disk->private_data; - switch (cmd) { - case HDIO_GETGEO: - if (!usermem) - return -EINVAL; - - geom.heads = (u8) port->dev_geom_head; - geom.sectors = (u8) port->dev_geom_sect; - geom.cylinders = port->dev_geom_cyl; - geom.start = get_start_sect(ino->i_bdev); - - if (copy_to_user(usermem, &geom, sizeof(geom))) - return -EFAULT; - return 0; - - default: - break; - } - - return -EOPNOTSUPP; + geo->heads = (u8) port->dev_geom_head; + geo->sectors = (u8) port->dev_geom_sect; + geo->cylinders = port->dev_geom_cyl; + return 0; } static const u32 msg_sizes[] = { 32, 64, 128, CARM_MSG_SIZE }; diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 0f48301342..15299e7a1a 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -809,34 +809,23 @@ static int mm_revalidate(struct gendisk *disk) set_capacity(disk, card->mm_size << 1); return 0; } -/* ------------------------------------------------------------------------------------ --- mm_ioctl ------------------------------------------------------------------------------------ -*/ -static int mm_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg) + +static int mm_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - if (cmd == HDIO_GETGEO) { - struct cardinfo *card = i->i_bdev->bd_disk->private_data; - int size = card->mm_size * (1024 / MM_HARDSECT); - struct hd_geometry geo; - /* - * get geometry: we have to fake one... trim the size to a - * multiple of 2048 (1M): tell we have 32 sectors, 64 heads, - * whatever cylinders. - */ - geo.heads = 64; - geo.sectors = 32; - geo.start = get_start_sect(i->i_bdev); - geo.cylinders = size / (geo.heads * geo.sectors); - - if (copy_to_user((void __user *) arg, &geo, sizeof(geo))) - return -EFAULT; - return 0; - } + struct cardinfo *card = bdev->bd_disk->private_data; + int size = card->mm_size * (1024 / MM_HARDSECT); - return -EINVAL; + /* + * get geometry: we have to fake one... trim the size to a + * multiple of 2048 (1M): tell we have 32 sectors, 64 heads, + * whatever cylinders. + */ + geo->heads = 64; + geo->sectors = 32; + geo->cylinders = size / (geo->heads * geo->sectors); + return 0; } + /* ----------------------------------------------------------------------------------- -- mm_check_change @@ -855,7 +844,7 @@ static int mm_check_change(struct gendisk *disk) */ static struct block_device_operations mm_fops = { .owner = THIS_MODULE, - .ioctl = mm_ioctl, + .getgeo = mm_getgeo, .revalidate_disk= mm_revalidate, .media_changed = mm_check_change, }; diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c index 063f0304a1..d1aaf31bd9 100644 --- a/drivers/block/viodasd.c +++ b/drivers/block/viodasd.c @@ -247,43 +247,17 @@ static int viodasd_release(struct inode *ino, struct file *fil) /* External ioctl entry point. 
*/ -static int viodasd_ioctl(struct inode *ino, struct file *fil, - unsigned int cmd, unsigned long arg) +static int viodasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - unsigned char sectors; - unsigned char heads; - unsigned short cylinders; - struct hd_geometry *geo; - struct gendisk *gendisk; - struct viodasd_device *d; + struct gendisk *disk = bdev->bd_disk; + struct viodasd_device *d = disk->private_data; - switch (cmd) { - case HDIO_GETGEO: - geo = (struct hd_geometry *)arg; - if (geo == NULL) - return -EINVAL; - if (!access_ok(VERIFY_WRITE, geo, sizeof(*geo))) - return -EFAULT; - gendisk = ino->i_bdev->bd_disk; - d = gendisk->private_data; - sectors = d->sectors; - if (sectors == 0) - sectors = 32; - heads = d->tracks; - if (heads == 0) - heads = 64; - cylinders = d->cylinders; - if (cylinders == 0) - cylinders = get_capacity(gendisk) / (sectors * heads); - if (__put_user(sectors, &geo->sectors) || - __put_user(heads, &geo->heads) || - __put_user(cylinders, &geo->cylinders) || - __put_user(get_start_sect(ino->i_bdev), &geo->start)) - return -EFAULT; - return 0; - } + geo->sectors = d->sectors ? d->sectors : 0; + geo->heads = d->tracks ? d->tracks : 64; + geo->cylinders = d->cylinders ? d->cylinders : + get_capacity(disk) / (geo->cylinders * geo->heads); - return -EINVAL; + return 0; } /* @@ -293,7 +267,7 @@ static struct block_device_operations viodasd_fops = { .owner = THIS_MODULE, .open = viodasd_open, .release = viodasd_release, - .ioctl = viodasd_ioctl, + .getgeo = viodasd_getgeo, }; /* diff --git a/drivers/block/xd.c b/drivers/block/xd.c index 68b6d7b154..97f5dab24b 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -128,9 +128,12 @@ static DEFINE_SPINLOCK(xd_lock); static struct gendisk *xd_gendisk[2]; +static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo); + static struct block_device_operations xd_fops = { .owner = THIS_MODULE, .ioctl = xd_ioctl, + .getgeo = xd_getgeo, }; static DECLARE_WAIT_QUEUE_HEAD(xd_wait_int); static u_char xd_drives, xd_irq = 5, xd_dma = 3, xd_maxsectors; @@ -330,22 +333,20 @@ static void do_xd_request (request_queue_t * q) } } +static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + XD_INFO *p = bdev->bd_disk->private_data; + + geo->heads = p->heads; + geo->sectors = p->sectors; + geo->cylinders = p->cylinders; + return 0; +} + /* xd_ioctl: handle device ioctl's */ static int xd_ioctl (struct inode *inode,struct file *file,u_int cmd,u_long arg) { - XD_INFO *p = inode->i_bdev->bd_disk->private_data; - switch (cmd) { - case HDIO_GETGEO: - { - struct hd_geometry g; - struct hd_geometry __user *geom= (void __user *)arg; - g.heads = p->heads; - g.sectors = p->sectors; - g.cylinders = p->cylinders; - g.start = get_start_sect(inode->i_bdev); - return copy_to_user(geom, &g, sizeof(g)) ? 
-EFAULT : 0; - } case HDIO_SET_DMA: if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (xdc_busy) return -EBUSY; diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 4b441720b6..cab362ea03 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -1130,6 +1130,17 @@ static int idedisk_release(struct inode *inode, struct file *filp) return 0; } +static int idedisk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct ide_disk_obj *idkp = ide_disk_g(bdev->bd_disk); + ide_drive_t *drive = idkp->drive; + + geo->heads = drive->bios_head; + geo->sectors = drive->bios_sect; + geo->cylinders = (u16)drive->bios_cyl; /* truncate */ + return 0; +} + static int idedisk_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { @@ -1164,6 +1175,7 @@ static struct block_device_operations idedisk_ops = { .open = idedisk_open, .release = idedisk_release, .ioctl = idedisk_ioctl, + .getgeo = idedisk_getgeo, .media_changed = idedisk_media_changed, .revalidate_disk= idedisk_revalidate_disk }; diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index fba3fffc2d..5945f551aa 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -2031,6 +2031,17 @@ static int idefloppy_release(struct inode *inode, struct file *filp) return 0; } +static int idefloppy_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct ide_floppy_obj *floppy = ide_floppy_g(bdev->bd_disk); + ide_drive_t *drive = floppy->drive; + + geo->heads = drive->bios_head; + geo->sectors = drive->bios_sect; + geo->cylinders = (u16)drive->bios_cyl; /* truncate */ + return 0; +} + static int idefloppy_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { @@ -2120,6 +2131,7 @@ static struct block_device_operations idefloppy_ops = { .open = idefloppy_open, .release = idefloppy_release, .ioctl = idefloppy_ioctl, + .getgeo = idefloppy_getgeo, .media_changed = idefloppy_media_changed, .revalidate_disk= idefloppy_revalidate_disk }; diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index 4b524f6b3e..b069b13b75 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -1278,19 +1278,6 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device up(&ide_setting_sem); switch (cmd) { - case HDIO_GETGEO: - { - struct hd_geometry geom; - if (!p || (drive->media != ide_disk && drive->media != ide_floppy)) return -EINVAL; - geom.heads = drive->bios_head; - geom.sectors = drive->bios_sect; - geom.cylinders = (u16)drive->bios_cyl; /* truncate */ - geom.start = get_start_sect(bdev); - if (copy_to_user(p, &geom, sizeof(struct hd_geometry))) - return -EFAULT; - return 0; - } - case HDIO_OBSOLETE_IDENTITY: case HDIO_GET_IDENTITY: if (bdev != bdev->bd_contains) diff --git a/drivers/ide/legacy/hd.c b/drivers/ide/legacy/hd.c index 242029c9c0..6439dec668 100644 --- a/drivers/ide/legacy/hd.c +++ b/drivers/ide/legacy/hd.c @@ -658,22 +658,14 @@ static void do_hd_request (request_queue_t * q) enable_irq(HD_IRQ); } -static int hd_ioctl(struct inode * inode, struct file * file, - unsigned int cmd, unsigned long arg) +static int hd_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - struct hd_i_struct *disk = inode->i_bdev->bd_disk->private_data; - struct hd_geometry __user *loc = (struct hd_geometry __user *) arg; - struct hd_geometry g; - - if (cmd != HDIO_GETGEO) - return -EINVAL; - if (!loc) - return -EINVAL; - g.heads = disk->head; - g.sectors = disk->sect; - g.cylinders = disk->cyl; - g.start = 
get_start_sect(inode->i_bdev); - return copy_to_user(loc, &g, sizeof g) ? -EFAULT : 0; + struct hd_i_struct *disk = bdev->bd_disk->private_data; + + geo->heads = disk->head; + geo->sectors = disk->sect; + geo->cylinders = disk->cyl; + return 0; } /* @@ -695,7 +687,7 @@ static irqreturn_t hd_interrupt(int irq, void *dev_id, struct pt_regs *regs) } static struct block_device_operations hd_fops = { - .ioctl = hd_ioctl, + .getgeo = hd_getgeo, }; /* diff --git a/drivers/md/md.c b/drivers/md/md.c index 1b76fb29fb..e423a16ba3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3598,12 +3598,21 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev) return 0; } +static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + mddev_t *mddev = bdev->bd_disk->private_data; + + geo->heads = 2; + geo->sectors = 4; + geo->cylinders = get_capacity(mddev->gendisk) / 8; + return 0; +} + static int md_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { int err = 0; void __user *argp = (void __user *)arg; - struct hd_geometry __user *loc = argp; mddev_t *mddev = NULL; if (!capable(CAP_SYS_ADMIN)) @@ -3765,24 +3774,6 @@ static int md_ioctl(struct inode *inode, struct file *file, * 4 sectors (with a BIG number of cylinders...). This drives * dosfs just mad... ;-) */ - case HDIO_GETGEO: - if (!loc) { - err = -EINVAL; - goto abort_unlock; - } - err = put_user (2, (char __user *) &loc->heads); - if (err) - goto abort_unlock; - err = put_user (4, (char __user *) &loc->sectors); - if (err) - goto abort_unlock; - err = put_user(get_capacity(mddev->gendisk)/8, - (short __user *) &loc->cylinders); - if (err) - goto abort_unlock; - err = put_user (get_start_sect(inode->i_bdev), - (long __user *) &loc->start); - goto done_unlock; } /* @@ -3911,6 +3902,7 @@ static struct block_device_operations md_fops = .open = md_open, .release = md_release, .ioctl = md_ioctl, + .getgeo = md_getgeo, .media_changed = md_media_changed, .revalidate_disk= md_revalidate, }; diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index 5b1febed31..b09fb63071 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -662,6 +662,13 @@ static int i2o_block_release(struct inode *inode, struct file *file) return 0; } +static int i2o_block_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + i2o_block_biosparam(get_capacity(bdev->bd_disk), + &geo->cylinders, &geo->heads, &geo->sectors); + return 0; +} + /** * i2o_block_ioctl - Issue device specific ioctl calls. * @cmd: ioctl command @@ -676,7 +683,6 @@ static int i2o_block_ioctl(struct inode *inode, struct file *file, { struct gendisk *disk = inode->i_bdev->bd_disk; struct i2o_block_device *dev = disk->private_data; - void __user *argp = (void __user *)arg; /* Anyone capable of this syscall can do *real bad* things */ @@ -684,15 +690,6 @@ static int i2o_block_ioctl(struct inode *inode, struct file *file, return -EPERM; switch (cmd) { - case HDIO_GETGEO: - { - struct hd_geometry g; - i2o_block_biosparam(get_capacity(disk), - &g.cylinders, &g.heads, &g.sectors); - g.start = get_start_sect(inode->i_bdev); - return copy_to_user(argp, &g, sizeof(g)) ? 
-EFAULT : 0; - } - case BLKI2OGRSTRAT: return put_user(dev->rcache, (int __user *)arg); case BLKI2OGWSTRAT: @@ -962,6 +959,7 @@ static struct block_device_operations i2o_block_fops = { .open = i2o_block_open, .release = i2o_block_release, .ioctl = i2o_block_ioctl, + .getgeo = i2o_block_getgeo, .media_changed = i2o_block_media_changed }; diff --git a/drivers/mmc/mmc_block.c b/drivers/mmc/mmc_block.c index 198561d217..d5f2898159 100644 --- a/drivers/mmc/mmc_block.c +++ b/drivers/mmc/mmc_block.c @@ -113,31 +113,18 @@ static int mmc_blk_release(struct inode *inode, struct file *filp) } static int -mmc_blk_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) +mmc_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - struct block_device *bdev = inode->i_bdev; - - if (cmd == HDIO_GETGEO) { - struct hd_geometry geo; - - memset(&geo, 0, sizeof(struct hd_geometry)); - - geo.cylinders = get_capacity(bdev->bd_disk) / (4 * 16); - geo.heads = 4; - geo.sectors = 16; - geo.start = get_start_sect(bdev); - - return copy_to_user((void __user *)arg, &geo, sizeof(geo)) - ? -EFAULT : 0; - } - - return -ENOTTY; + geo->cylinders = get_capacity(bdev->bd_disk) / (4 * 16); + geo->heads = 4; + geo->sectors = 16; + return 0; } static struct block_device_operations mmc_bdops = { .open = mmc_blk_open, .release = mmc_blk_release, - .ioctl = mmc_blk_ioctl, + .getgeo = mmc_blk_getgeo, .owner = THIS_MODULE, }; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 339cb1218e..7f3ff500b6 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -194,6 +194,14 @@ static int blktrans_release(struct inode *i, struct file *f) return ret; } +static int blktrans_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct mtd_blktrans_dev *dev = bdev->bd_disk->private_data; + + if (dev->tr->getgeo) + return dev->tr->getgeo(dev, geo); + return -ENOTTY; +} static int blktrans_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) @@ -207,22 +215,6 @@ static int blktrans_ioctl(struct inode *inode, struct file *file, return tr->flush(dev); /* The core code did the work, we had nothing to do. */ return 0; - - case HDIO_GETGEO: - if (tr->getgeo) { - struct hd_geometry g; - int ret; - - memset(&g, 0, sizeof(g)); - ret = tr->getgeo(dev, &g); - if (ret) - return ret; - - g.start = get_start_sect(inode->i_bdev); - if (copy_to_user((void __user *)arg, &g, sizeof(g))) - return -EFAULT; - return 0; - } /* else */ default: return -ENOTTY; } @@ -233,6 +225,7 @@ struct block_device_operations mtd_blktrans_ops = { .open = blktrans_open, .release = blktrans_release, .ioctl = blktrans_ioctl, + .getgeo = blktrans_getgeo, }; int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index f779f674df..2472fa1a1b 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -1723,12 +1724,34 @@ dasd_release(struct inode *inp, struct file *filp) return 0; } +/* + * Return disk geometry. 
+ */ +static int +dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct dasd_device *device; + + device = bdev->bd_disk->private_data; + if (!device) + return -ENODEV; + + if (!device->discipline || + !device->discipline->fill_geometry) + return -EINVAL; + + device->discipline->fill_geometry(device, geo); + geo->start = get_start_sect(bdev) >> device->s2b_shift; + return 0; +} + struct block_device_operations dasd_device_operations = { .owner = THIS_MODULE, .open = dasd_open, .release = dasd_release, .ioctl = dasd_ioctl, + .getgeo = dasd_getgeo, }; diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 044b753719..8e4dcd5859 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -485,33 +485,6 @@ dasd_ioctl_set_ro(struct block_device *bdev, int no, long args) return rc; } -/* - * Return disk geometry. - */ -static int -dasd_ioctl_getgeo(struct block_device *bdev, int no, long args) -{ - struct hd_geometry geo = { 0, }; - struct dasd_device *device; - - device = bdev->bd_disk->private_data; - if (device == NULL) - return -ENODEV; - - if (device == NULL || device->discipline == NULL || - device->discipline->fill_geometry == NULL) - return -EINVAL; - - geo = (struct hd_geometry) {}; - device->discipline->fill_geometry(device, &geo); - geo.start = get_start_sect(bdev) >> device->s2b_shift; - if (copy_to_user((struct hd_geometry __user *) args, &geo, - sizeof (struct hd_geometry))) - return -EFAULT; - - return 0; -} - /* * List of static ioctls. */ @@ -528,7 +501,6 @@ static struct { int no; dasd_ioctl_fn_t fn; } dasd_ioctls[] = { BIODASDPRRST, dasd_ioctl_reset_profile }, { BLKROSET, dasd_ioctl_set_ro }, { DASDAPIVER, dasd_ioctl_api_version }, - { HDIO_GETGEO, dasd_ioctl_getgeo }, { -1, NULL } }; diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index bf3a67c3cc..54ecd548c3 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -328,31 +328,27 @@ fail: return 0; } -static int xpram_ioctl (struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static int xpram_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - struct hd_geometry __user *geo; unsigned long size; - if (cmd != HDIO_GETGEO) - return -EINVAL; + /* * get geometry: we have to fake one... trim the size to a * multiple of 64 (32k): tell we have 16 sectors, 4 heads, * whatever cylinders. Tell also that data starts at sector. 4. 
*/ - geo = (struct hd_geometry __user *) arg; size = (xpram_pages * 8) & ~0x3f; - put_user(size >> 6, &geo->cylinders); - put_user(4, &geo->heads); - put_user(16, &geo->sectors); - put_user(4, &geo->start); + geo->cylinders = size >> 6; + geo->heads = 4; + geo->sectors = 16; + geo->start = 4; return 0; } static struct block_device_operations xpram_devops = { .owner = THIS_MODULE, - .ioctl = xpram_ioctl, + .getgeo = xpram_getgeo, }; /* diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 32d4d8d7b9..4c5127ed37 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -527,7 +527,7 @@ static int sd_release(struct inode *inode, struct file *filp) return 0; } -static int sd_hdio_getgeo(struct block_device *bdev, struct hd_geometry __user *loc) +static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdp = sdkp->device; @@ -545,15 +545,9 @@ static int sd_hdio_getgeo(struct block_device *bdev, struct hd_geometry __user * else scsicam_bios_param(bdev, sdkp->capacity, diskinfo); - if (put_user(diskinfo[0], &loc->heads)) - return -EFAULT; - if (put_user(diskinfo[1], &loc->sectors)) - return -EFAULT; - if (put_user(diskinfo[2], &loc->cylinders)) - return -EFAULT; - if (put_user((unsigned)get_start_sect(bdev), - (unsigned long __user *)&loc->start)) - return -EFAULT; + geo->heads = diskinfo[0]; + geo->sectors = diskinfo[1]; + geo->cylinders = diskinfo[2]; return 0; } @@ -593,12 +587,6 @@ static int sd_ioctl(struct inode * inode, struct file * filp, if (!scsi_block_when_processing_errors(sdp) || !error) return error; - if (cmd == HDIO_GETGEO) { - if (!arg) - return -EINVAL; - return sd_hdio_getgeo(bdev, p); - } - /* * Send SCSI addressing ioctls directly to mid level, send other * ioctls to block level and then onto mid level if they can't be @@ -800,6 +788,7 @@ static struct block_device_operations sd_fops = { .open = sd_open, .release = sd_release, .ioctl = sd_ioctl, + .getgeo = sd_getgeo, #ifdef CONFIG_COMPAT .compat_ioctl = sd_compat_ioctl, #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 74c01aabd4..7f140480c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -224,6 +224,7 @@ extern int dir_notify_enable; #include #include +struct hd_geometry; struct iovec; struct nameidata; struct kiocb; @@ -962,6 +963,7 @@ struct block_device_operations { int (*direct_access) (struct block_device *, sector_t, unsigned long *); int (*media_changed) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); + int (*getgeo)(struct block_device *, struct hd_geometry *); struct module *owner; }; -- cgit v1.2.2 From 9fa37fd1627ec804e57ae0388555719b03b39f20 Mon Sep 17 00:00:00 2001 From: "taneli.vahakangas@netsonic.fi" Date: Sun, 8 Jan 2006 01:03:02 -0800 Subject: [PATCH] nbd: remove duplicate assignment Sent by Paul Clements , who needs to read Documentation/SubmittingPatches.. 
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/nbd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 33d6f237b2..6997d8e6bf 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -174,7 +174,6 @@ static int sock_xmit(struct socket *sock, int send, void *buf, int size, msg.msg_namelen = 0; msg.msg_control = NULL; msg.msg_controllen = 0; - msg.msg_namelen = 0; msg.msg_flags = msg_flags | MSG_NOSIGNAL; if (send) -- cgit v1.2.2 From 676121fcb66c861804e38d94214fd5670a1ef595 Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Sun, 8 Jan 2006 01:03:04 -0800 Subject: [PATCH] Unchecked alloc_percpu() return in __create_workqueue() __create_workqueue() not checking return of alloc_percpu() NULL dereference was possible. Signed-off-by: Ben Collins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 62d4722069..e72fb6478d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -315,6 +315,11 @@ struct workqueue_struct *__create_workqueue(const char *name, return NULL; wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); + if (!wq->cpu_wq) { + kfree(wq); + return NULL; + } + wq->name = name; /* We don't need the distraction of CPUs appearing and vanishing. */ lock_cpu_hotplug(); -- cgit v1.2.2 From 54b21a7992a31d30c9a91f7e0a00ffdb4bd0caee Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:03:05 -0800 Subject: [PATCH] fix possible PAGE_CACHE_SHIFT overflows We've had two instances recently of overflows when doing 64_bit_value = (32_bit_value << PAGE_CACHE_SHIFT) I did a tree-wide grep of `<<.*PAGE_CACHE_SHIFT' and this is the result. - afs_rxfs_fetch_descriptor.offset is of type off_t, which seems broken. - jfs and jffs are limited to 4GB anyway. - reiserfs map_block_for_writepage() takes an unsigned long for the block - it should take sector_t. (It'll fail for huge filesystems with blocksizepage_base) Cc: Oleg Drokin Cc: David Howells Cc: David Woodhouse Cc: Cc: Christoph Hellwig Cc: Anton Altaparmakov Cc: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Cc: Roman Zippel Cc: Cc: Miklos Szeredi Cc: Russell King Cc: Trond Myklebust Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/dir.c | 2 +- fs/buffer.c | 6 +++--- fs/freevxfs/vxfs_immed.c | 4 ++-- fs/jffs/inode-v23.c | 4 ++-- fs/mpage.c | 4 ++-- fs/romfs/inode.c | 6 +++--- fs/smbfs/file.c | 4 ++-- fs/sysv/dir.c | 4 ++-- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 6682d6d7f2..5c61c24dab 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -137,7 +137,7 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page) #endif /* determine how many magic numbers there should be in this page */ - latter = dir->i_size - (page->index << PAGE_CACHE_SHIFT); + latter = dir->i_size - page_offset(page); if (latter >= PAGE_SIZE) qty = PAGE_SIZE; else diff --git a/fs/buffer.c b/fs/buffer.c index 263df0f192..55f0975a9b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1762,7 +1762,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, * handle that here by just cleaning them. 
*/ - block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); head = page_buffers(page); bh = head; @@ -2635,7 +2635,7 @@ int block_truncate_page(struct address_space *mapping, pgoff_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); unsigned blocksize; - pgoff_t iblock; + sector_t iblock; unsigned length, pos; struct inode *inode = mapping->host; struct page *page; @@ -2651,7 +2651,7 @@ int block_truncate_page(struct address_space *mapping, return 0; length = blocksize - length; - iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); page = grab_cache_page(mapping, index); err = -ENOMEM; diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index d0401dc68d..6f5df1700e 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -99,8 +99,8 @@ static int vxfs_immed_readpage(struct file *fp, struct page *pp) { struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); - u_int64_t offset = pp->index << PAGE_CACHE_SHIFT; - caddr_t kaddr; + u_int64_t offset = (u_int64_t)pp->index << PAGE_CACHE_SHIFT; + caddr_t kaddr; kaddr = kmap(pp); memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_CACHE_SIZE); diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index 3dcc6d2162..2559ee10be 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -757,7 +757,7 @@ jffs_do_readpage_nolock(struct file *file, struct page *page) read_len = 0; result = 0; - offset = page->index << PAGE_CACHE_SHIFT; + offset = page_offset(page); kmap(page); buf = page_address(page); @@ -1545,7 +1545,7 @@ jffs_commit_write(struct file *filp, struct page *page, { void *addr = page_address(page) + from; /* XXX: PAGE_CACHE_SHIFT or PAGE_SHIFT */ - loff_t pos = (page->index<index << (PAGE_CACHE_SHIFT - blkbits); + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); last_block = (i_size_read(inode) + blocksize - 1) >> blkbits; bh.b_page = page; @@ -466,7 +466,7 @@ __mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, * The page has no buffers: map it to disk */ BUG_ON(!PageUptodate(page)); - block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits); + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); last_block = (i_size - 1) >> blkbits; map_bh.b_page = page; for (page_block = 0; page_block < blocks_per_page; ) { diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index c74f382dab..0a13859fd5 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -418,7 +418,7 @@ static int romfs_readpage(struct file *file, struct page * page) { struct inode *inode = page->mapping->host; - unsigned long offset, avail, readlen; + loff_t offset, avail, readlen; void *buf; int result = -EIO; @@ -429,8 +429,8 @@ romfs_readpage(struct file *file, struct page * page) goto err_out; /* 32 bit warning -- but not for us :) */ - offset = page->index << PAGE_CACHE_SHIFT; - if (offset < inode->i_size) { + offset = page_offset(page); + if (offset < i_size_read(inode)) { avail = inode->i_size-offset; readlen = min_t(unsigned long, avail, PAGE_SIZE); if (romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen) == readlen) { diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index 3c6eb3ba71..7042e62726 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -209,8 +209,8 @@ smb_updatepage(struct file *file, struct page *page, unsigned long offset, { struct dentry *dentry = 
file->f_dentry; - DEBUG1("(%s/%s %d@%ld)\n", DENTRY_PATH(dentry), - count, (page->index << PAGE_CACHE_SHIFT)+offset); + DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count, + ((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset); return smb_writepage_sync(dentry->d_inode, page, offset, count); } diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 69a085abad..cce8b05cba 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -103,7 +103,7 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) offset = (char *)de - kaddr; over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN), - (n<inode), DT_UNKNOWN); if (over) { @@ -115,7 +115,7 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) } done: - filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; + filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset; unlock_kernel(); return 0; } -- cgit v1.2.2 From 0811af28ce49fab963e3e6b8abcf8c460f971cd4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:03:09 -0800 Subject: [PATCH] kill_proc_info_as_uid: don't use hardcoded constants Use symbolic names instead of hardcoded constants. Signed-off-by: Oleg Nesterov Acked-by: Harald Welte Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index e20724af9b..114cf9209b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1220,8 +1220,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, ret = -ESRCH; goto out_unlock; } - if ((!info || ((unsigned long)info != 1 && - (unsigned long)info != 2 && SI_FROMUSER(info))) + if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && (euid != p->suid) && (euid != p->uid) && (uid != p->suid) && (uid != p->uid)) { ret = -EPERM; -- cgit v1.2.2 From bb6f6dbaa48c53525a7a4f9d4df719c3b0b582af Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:03:13 -0800 Subject: [PATCH] do_coredump() should reset group_stop_count earlier __group_complete_signal() sets ->group_stop_count in sig_kernel_coredump() path and marks the target thread as ->group_exit_task. So any thread except group_exit_task will go to handle_group_stop()->finish_stop(). However, when group_exit_task actually starts do_coredump(), it sets SIGNAL_GROUP_EXIT, but does not reset ->group_stop_count while killing other threads. If we have not yet stopped threads in the same thread group, they all will spin in kernel mode until group_exit_task sends them SIGKILL, because ->group_stop_count > 0 means: recalc_sigpending_tsk() never clears TIF_SIGPENDING get_signal_to_deliver() goes to handle_group_stop() handle_group_stop() returns when SIGNAL_GROUP_EXIT set syscall_exit/resume_userspace notice TIF_SIGPENDING, call get_signal_to_deliver() again. So we are wasting cpu cycles, and if one of these threads is rt_task() this may be a serious problem. NOTE: do_coredump() holds ->mmap_sem, so not stopped threads can't escape coredumping after clearing ->group_stop_count. 
See also this thread: http://marc.theaimsgroup.com/?t=112739139900002 Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 2075b674d8..fd02ea4a81 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1462,6 +1462,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) { current->signal->flags = SIGNAL_GROUP_EXIT; current->signal->group_exit_code = exit_code; + current->signal->group_stop_count = 0; retval = 0; } spin_unlock_irq(¤t->sighand->siglock); @@ -1477,7 +1478,6 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) * Clear any false indication of pending signals that might * be seen by the filesystem code called to write the core file. */ - current->signal->group_stop_count = 0; clear_thread_flag(TIF_SIGPENDING); if (current->signal->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) -- cgit v1.2.2 From 485a6435abc3897934ce0dc530e31db93e9296a6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:03:14 -0800 Subject: [PATCH] little do_group_exit() cleanup zap_other_threads() sets SIGNAL_GROUP_EXIT at the very start, do_group_exit() doesn't need to do it. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/exit.c b/kernel/exit.c index c73a7eb26d..a80824f610 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -925,7 +925,6 @@ do_group_exit(int exit_code) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else { - sig->flags = SIGNAL_GROUP_EXIT; sig->group_exit_code = exit_code; zap_other_threads(current); } -- cgit v1.2.2 From 55a82ab3181be039c6440d3f2f69260ad6fe2988 Mon Sep 17 00:00:00 2001 From: Kylene Jo Hall Date: Sun, 8 Jan 2006 01:03:15 -0800 Subject: [PATCH] tpm: add bios measurement log According to the TCG specifications measurements or hashes of the BIOS code and data are extended into TPM PCRS and a log is kept in an ACPI table of these extensions for later validation if desired. This patch exports the values in the ACPI table through a security-fs seq_file. Signed-off-by: Seiji Munetoh Signed-off-by: Stefan Berger Signed-off-by: Reiner Sailer Signed-off-by: Kylene Hall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/acpi/osl.c | 2 + drivers/char/tpm/Makefile | 3 + drivers/char/tpm/tpm.c | 3 + drivers/char/tpm/tpm.h | 15 ++ drivers/char/tpm/tpm_bios.c | 508 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 531 insertions(+) create mode 100644 drivers/char/tpm/tpm_bios.c diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index e3cd0b1603..20c9a37643 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -204,11 +204,13 @@ acpi_os_map_memory(acpi_physical_address phys, acpi_size size, return AE_OK; } +EXPORT_SYMBOL_GPL(acpi_os_map_memory); void acpi_os_unmap_memory(void __iomem * virt, acpi_size size) { iounmap(virt); } +EXPORT_SYMBOL_GPL(acpi_os_unmap_memory); #ifdef ACPI_FUTURE_USAGE acpi_status diff --git a/drivers/char/tpm/Makefile b/drivers/char/tpm/Makefile index 2392e404e8..ba4582d160 100644 --- a/drivers/char/tpm/Makefile +++ b/drivers/char/tpm/Makefile @@ -2,6 +2,9 @@ # Makefile for the kernel tpm device drivers. 
# obj-$(CONFIG_TCG_TPM) += tpm.o +ifdef CONFIG_ACPI + obj-$(CONFIG_TCG_TPM) += tpm_bios.o +endif obj-$(CONFIG_TCG_NSC) += tpm_nsc.o obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c index a9be0e8eae..5a3870477e 100644 --- a/drivers/char/tpm/tpm.c +++ b/drivers/char/tpm/tpm.c @@ -466,6 +466,7 @@ void tpm_remove_hardware(struct device *dev) kfree(chip->vendor->miscdev.name); sysfs_remove_group(&dev->kobj, chip->vendor->attr_group); + tpm_bios_log_teardown(chip->bios_dir); dev_mask[chip->dev_num / TPM_NUM_MASK_ENTRIES ] &= ~(1 << (chip->dev_num % TPM_NUM_MASK_ENTRIES)); @@ -593,6 +594,8 @@ dev_num_search_complete: sysfs_create_group(&dev->kobj, chip->vendor->attr_group); + chip->bios_dir = tpm_bios_log_setup(devname); + return 0; } EXPORT_SYMBOL_GPL(tpm_register_hardware); diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index 159882ca69..fd3a4beaa5 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -82,6 +82,8 @@ struct tpm_chip { struct tpm_vendor_specific *vendor; + struct dentry **bios_dir; + struct list_head list; }; @@ -107,3 +109,16 @@ extern ssize_t tpm_read(struct file *, char __user *, size_t, loff_t *); extern void tpm_remove_hardware(struct device *); extern int tpm_pm_suspend(struct device *, pm_message_t); extern int tpm_pm_resume(struct device *); + +#ifdef CONFIG_ACPI +extern struct dentry ** tpm_bios_log_setup(char *); +extern void tpm_bios_log_teardown(struct dentry **); +#else +static inline struct dentry* tpm_bios_log_setup(char *name) +{ + return NULL; +} +static inline void tpm_bios_log_teardown(struct dentry **dir) +{ +} +#endif diff --git a/drivers/char/tpm/tpm_bios.c b/drivers/char/tpm/tpm_bios.c new file mode 100644 index 0000000000..5e1f4dfefa --- /dev/null +++ b/drivers/char/tpm/tpm_bios.c @@ -0,0 +1,508 @@ +/* + * Copyright (C) 2005 IBM Corporation + * + * Authors: + * Seiji Munetoh + * Stefan Berger + * Reiner Sailer + * Kylene Hall + * + * Access to the eventlog extended by the TCG BIOS of PC platform + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include "tpm.h" + +#define TCG_EVENT_NAME_LEN_MAX 255 +#define MAX_TEXT_EVENT 1000 /* Max event string length */ +#define ACPI_TCPA_SIG "TCPA" /* 0x41504354 /'TCPA' */ + +struct acpi_tcpa { + struct acpi_table_header hdr; + u16 reserved; + u32 log_max_len __attribute__ ((packed)); + u32 log_start_addr __attribute__ ((packed)); +}; + +struct tcpa_event { + u32 pcr_index; + u32 event_type; + u8 pcr_value[20]; /* SHA1 */ + u32 event_size; + u8 event_data[0]; +}; + +enum tcpa_event_types { + PREBOOT = 0, + POST_CODE, + UNUSED, + NO_ACTION, + SEPARATOR, + ACTION, + EVENT_TAG, + SCRTM_CONTENTS, + SCRTM_VERSION, + CPU_MICROCODE, + PLATFORM_CONFIG_FLAGS, + TABLE_OF_DEVICES, + COMPACT_HASH, + IPL, + IPL_PARTITION_DATA, + NONHOST_CODE, + NONHOST_CONFIG, + NONHOST_INFO, +}; + +static const char* tcpa_event_type_strings[] = { + "PREBOOT", + "POST CODE", + "", + "NO ACTION", + "SEPARATOR", + "ACTION", + "EVENT TAG", + "S-CRTM Contents", + "S-CRTM Version", + "CPU Microcode", + "Platform Config Flags", + "Table of Devices", + "Compact Hash", + "IPL", + "IPL Partition Data", + "Non-Host Code", + "Non-Host Config", + "Non-Host Info" +}; + +enum tcpa_pc_event_ids { + SMBIOS = 1, + BIS_CERT, + POST_BIOS_ROM, + ESCD, + CMOS, + NVRAM, + OPTION_ROM_EXEC, + OPTION_ROM_CONFIG, + OPTION_ROM_MICROCODE, + S_CRTM_VERSION, + S_CRTM_CONTENTS, + POST_CONTENTS, +}; + +static const char* tcpa_pc_event_id_strings[] = { + "" + "SMBIOS", + "BIS Certificate", + "POST BIOS ", + "ESCD ", + "CMOS", + "NVRAM", + "Option ROM", + "Option ROM config", + "Option ROM microcode", + "S-CRTM Version", + "S-CRTM Contents", + "S-CRTM POST Contents", +}; + +/* (Binary) bios measurement buffer */ +static void *tcg_eventlog; +static void *tcg_eventlog_addr_limit; /* MAX */ + +/* returns pointer to start of pos. 
entry of tcg log */ +static void *tpm_bios_measurements_start(struct seq_file *m, loff_t *pos) +{ + loff_t i; + void *addr = tcg_eventlog; + struct tcpa_event *event; + + /* read over *pos measurements */ + for (i = 0; i < *pos; i++) { + event = addr; + + if ((addr + sizeof(struct tcpa_event)) < + tcg_eventlog_addr_limit) { + if (event->event_type == 0 && event->event_size == 0) + return NULL; + addr += + sizeof(struct tcpa_event) + event->event_size; + } + } + + /* now check if current entry is valid */ + if ((addr + sizeof(struct tcpa_event)) >= tcg_eventlog_addr_limit) + return NULL; + + event = addr; + + if ((event->event_type == 0 && event->event_size == 0) || + ((addr + sizeof(struct tcpa_event) + event->event_size) >= + tcg_eventlog_addr_limit)) + return NULL; + + return addr; +} + +static void *tpm_bios_measurements_next(struct seq_file *m, void *v, + loff_t *pos) +{ + struct tcpa_event *event = v; + + v += sizeof(struct tcpa_event) + event->event_size; + + /* now check if current entry is valid */ + if ((v + sizeof(struct tcpa_event)) >= tcg_eventlog_addr_limit) + return NULL; + + event = v; + + if (event->event_type == 0 && event->event_size == 0) + return NULL; + + if ((event->event_type == 0 && event->event_size == 0) || + ((v + sizeof(struct tcpa_event) + event->event_size) >= + tcg_eventlog_addr_limit)) + return NULL; + + (*pos)++; + return v; +} + +static void tpm_bios_measurements_stop(struct seq_file *m, void *v) +{ +} + +static int get_event_name(char *dest, struct tcpa_event *event, + unsigned char * event_entry) +{ + const char *name = ""; + char data[40] = ""; + int i, n_len = 0, d_len = 0; + u32 event_id, event_data_size; + + switch(event->event_type) { + case PREBOOT: + case POST_CODE: + case UNUSED: + case NO_ACTION: + case SCRTM_CONTENTS: + case SCRTM_VERSION: + case CPU_MICROCODE: + case PLATFORM_CONFIG_FLAGS: + case TABLE_OF_DEVICES: + case COMPACT_HASH: + case IPL: + case IPL_PARTITION_DATA: + case NONHOST_CODE: + case NONHOST_CONFIG: + case NONHOST_INFO: + name = tcpa_event_type_strings[event->event_type]; + n_len = strlen(name); + break; + case SEPARATOR: + case ACTION: + if (MAX_TEXT_EVENT > event->event_size) { + name = event_entry; + n_len = event->event_size; + } + break; + case EVENT_TAG: + event_id = be32_to_cpu(event_entry); + event_data_size = be32_to_cpu(&event_entry[4]); + + /* ToDo Row data -> Base64 */ + + switch (event_id) { + case SMBIOS: + case BIS_CERT: + case CMOS: + case NVRAM: + case OPTION_ROM_EXEC: + case OPTION_ROM_CONFIG: + case OPTION_ROM_MICROCODE: + case S_CRTM_VERSION: + case S_CRTM_CONTENTS: + case POST_CONTENTS: + name = tcpa_pc_event_id_strings[event_id]; + n_len = strlen(name); + break; + case POST_BIOS_ROM: + case ESCD: + name = tcpa_pc_event_id_strings[event_id]; + n_len = strlen(name); + for (i = 0; i < 20; i++) + d_len += sprintf(data, "%02x", + event_entry[8 + i]); + break; + default: + break; + } + default: + break; + } + + return snprintf(dest, MAX_TEXT_EVENT, "[%.*s%.*s]", + n_len, name, d_len, data); + +} + +static int tpm_binary_bios_measurements_show(struct seq_file *m, void *v) +{ + + char *eventname; + char data[4]; + u32 help; + int i, len; + struct tcpa_event *event = (struct tcpa_event *) v; + unsigned char *event_entry = + (unsigned char *) (v + sizeof(struct tcpa_event)); + + eventname = kmalloc(MAX_TEXT_EVENT, GFP_KERNEL); + if (!eventname) { + printk(KERN_ERR "%s: ERROR - No Memory for event name\n ", + __func__); + return -ENOMEM; + } + + /* 1st: PCR used is in little-endian format (4 bytes) */ + help = 
le32_to_cpu(event->pcr_index); + memcpy(data, &help, 4); + for (i = 0; i < 4; i++) + seq_putc(m, data[i]); + + /* 2nd: SHA1 (20 bytes) */ + for (i = 0; i < 20; i++) + seq_putc(m, event->pcr_value[i]); + + /* 3rd: event type identifier (4 bytes) */ + help = le32_to_cpu(event->event_type); + memcpy(data, &help, 4); + for (i = 0; i < 4; i++) + seq_putc(m, data[i]); + + len = 0; + + len += get_event_name(eventname, event, event_entry); + + /* 4th: filename <= 255 + \'0' delimiter */ + if (len > TCG_EVENT_NAME_LEN_MAX) + len = TCG_EVENT_NAME_LEN_MAX; + + for (i = 0; i < len; i++) + seq_putc(m, eventname[i]); + + /* 5th: delimiter */ + seq_putc(m, '\0'); + + return 0; +} + +static int tpm_bios_measurements_release(struct inode *inode, + struct file *file) +{ + if (tcg_eventlog) { + kfree(tcg_eventlog); + tcg_eventlog = NULL; + } + return seq_release(inode, file); +} + +static int tpm_ascii_bios_measurements_show(struct seq_file *m, void *v) +{ + int len = 0; + int i; + char *eventname; + struct tcpa_event *event = v; + unsigned char *event_entry = + (unsigned char *) (v + sizeof(struct tcpa_event)); + + eventname = kmalloc(MAX_TEXT_EVENT, GFP_KERNEL); + if (!eventname) { + printk(KERN_ERR "%s: ERROR - No Memory for event name\n ", + __func__); + return -EFAULT; + } + + seq_printf(m, "%2d ", event->pcr_index); + + /* 2nd: SHA1 */ + for (i = 0; i < 20; i++) + seq_printf(m, "%02x", event->pcr_value[i]); + + /* 3rd: event type identifier */ + seq_printf(m, " %02x", event->event_type); + + len += get_event_name(eventname, event, event_entry); + + /* 4th: eventname <= max + \'0' delimiter */ + seq_printf(m, " %s\n", eventname); + + return 0; +} + +static struct seq_operations tpm_ascii_b_measurments_seqops = { + .start = tpm_bios_measurements_start, + .next = tpm_bios_measurements_next, + .stop = tpm_bios_measurements_stop, + .show = tpm_ascii_bios_measurements_show, +}; + +static struct seq_operations tpm_binary_b_measurments_seqops = { + .start = tpm_bios_measurements_start, + .next = tpm_bios_measurements_next, + .stop = tpm_bios_measurements_stop, + .show = tpm_binary_bios_measurements_show, +}; + +/* read binary bios log */ +static int read_log(void) +{ + struct acpi_tcpa *buff; + acpi_status status; + void *virt; + + if (tcg_eventlog != NULL) { + printk(KERN_ERR + "%s: ERROR - Eventlog already initialized\n", + __func__); + return -EFAULT; + } + + /* Find TCPA entry in RSDT (ACPI_LOGICAL_ADDRESSING) */ + status = acpi_get_firmware_table(ACPI_TCPA_SIG, 1, + ACPI_LOGICAL_ADDRESSING, + (struct acpi_table_header **) + &buff); + + if (ACPI_FAILURE(status)) { + printk(KERN_ERR "%s: ERROR - Could not get TCPA table\n", + __func__); + return -EIO; + } + + if (buff->log_max_len == 0) { + printk(KERN_ERR "%s: ERROR - TCPA log area empty\n", + __func__); + return -EIO; + } + + /* malloc EventLog space */ + tcg_eventlog = kmalloc(buff->log_max_len, GFP_KERNEL); + if (!tcg_eventlog) { + printk + ("%s: ERROR - Not enough Memory for BIOS measurements\n", + __func__); + return -ENOMEM; + } + + tcg_eventlog_addr_limit = tcg_eventlog + buff->log_max_len; + + acpi_os_map_memory(buff->log_start_addr, buff->log_max_len, &virt); + + memcpy(tcg_eventlog, virt, buff->log_max_len); + + acpi_os_unmap_memory(virt, buff->log_max_len); + return 0; +} + +static int tpm_ascii_bios_measurements_open(struct inode *inode, + struct file *file) +{ + int err; + + if ((err = read_log())) + return err; + + /* now register seq file */ + return seq_open(file, &tpm_ascii_b_measurments_seqops); +} + +struct file_operations 
tpm_ascii_bios_measurements_ops = { + .open = tpm_ascii_bios_measurements_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tpm_bios_measurements_release, +}; + +static int tpm_binary_bios_measurements_open(struct inode *inode, + struct file *file) +{ + int err; + + if ((err = read_log())) + return err; + + /* now register seq file */ + return seq_open(file, &tpm_binary_b_measurments_seqops); +} + +struct file_operations tpm_binary_bios_measurements_ops = { + .open = tpm_binary_bios_measurements_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tpm_bios_measurements_release, +}; + +struct dentry **tpm_bios_log_setup(char *name) +{ + struct dentry **ret = NULL, *tpm_dir, *bin_file, *ascii_file; + + tpm_dir = securityfs_create_dir(name, NULL); + if (!tpm_dir) + goto out; + + bin_file = + securityfs_create_file("binary_bios_measurements", + S_IRUSR | S_IRGRP, tpm_dir, NULL, + &tpm_binary_bios_measurements_ops); + if (!bin_file) + goto out_tpm; + + ascii_file = + securityfs_create_file("ascii_bios_measurements", + S_IRUSR | S_IRGRP, tpm_dir, NULL, + &tpm_ascii_bios_measurements_ops); + if (!ascii_file) + goto out_bin; + + ret = kmalloc(3 * sizeof(struct dentry *), GFP_KERNEL); + if (!ret) + goto out_ascii; + + ret[0] = ascii_file; + ret[1] = bin_file; + ret[2] = tpm_dir; + + return ret; + +out_ascii: + securityfs_remove(ascii_file); +out_bin: + securityfs_remove(bin_file); +out_tpm: + securityfs_remove(tpm_dir); +out: + return NULL; +} +EXPORT_SYMBOL_GPL(tpm_bios_log_setup); + +void tpm_bios_log_teardown(struct dentry **lst) +{ + int i; + + for (i = 0; i < 3; i++) + securityfs_remove(lst[i]); +} +EXPORT_SYMBOL_GPL(tpm_bios_log_teardown); -- cgit v1.2.2 From c809406b4f2dfac9006d7eb8dca6b9f990f10b61 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Sun, 8 Jan 2006 01:03:17 -0800 Subject: [PATCH] Updated CPU hotplug documentation Thanks to Nathan Lynch for the review and comments. Thanks to Joel Schopp for the pointer to add user space scipts. Signed-off-by: Ashok Raj Signed-off-by: Nathan Lynch Signed-off-by: Joel Schopp Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cpu-hotplug.txt | 357 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 357 insertions(+) create mode 100644 Documentation/cpu-hotplug.txt diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt new file mode 100644 index 0000000000..08c5d04f30 --- /dev/null +++ b/Documentation/cpu-hotplug.txt @@ -0,0 +1,357 @@ + CPU hotplug Support in Linux(tm) Kernel + + Maintainers: + CPU Hotplug Core: + Rusty Russell + Srivatsa Vaddagiri + i386: + Zwane Mwaikambo + ppc64: + Nathan Lynch + Joel Schopp + ia64/x86_64: + Ashok Raj + +Authors: Ashok Raj +Lots of feedback: Nathan Lynch , + Joel Schopp + +Introduction + +Modern advances in system architectures have introduced advanced error +reporting and correction capabilities in processors. CPU architectures permit +partitioning support, where compute resources of a single CPU could be made +available to virtual machine environments. There are couple OEMS that +support NUMA hardware which are hot pluggable as well, where physical +node insertion and removal require support for CPU hotplug. + +Such advances require CPUs available to a kernel to be removed either for +provisioning reasons, or for RAS purposes to keep an offending CPU off +system execution path. Hence the need for CPU hotplug support in the +Linux kernel. + +A more novel use of CPU-hotplug support is its use today in suspend +resume support for SMP. 
Dual-core and HT support makes even +a laptop run SMP kernels which didn't support these methods. SMP support +for suspend/resume is a work in progress. + +General Stuff about CPU Hotplug +-------------------------------- + +Command Line Switches +--------------------- +maxcpus=n Restrict boot time cpus to n. Say if you have 4 cpus, using + maxcpus=2 will only boot 2. You can choose to bring the + other cpus later online, read FAQ's for more info. + +additional_cpus=n [x86_64 only] use this to limit hotpluggable cpus. + This option sets + cpu_possible_map = cpu_present_map + additional_cpus + +CPU maps and such +----------------- +[More on cpumaps and primitive to manipulate, please check +include/linux/cpumask.h that has more descriptive text.] + +cpu_possible_map: Bitmap of possible CPUs that can ever be available in the +system. This is used to allocate some boot time memory for per_cpu variables +that aren't designed to grow/shrink as CPUs are made available or removed. +Once set during boot time discovery phase, the map is static, i.e no bits +are added or removed anytime. Trimming it accurately for your system needs +upfront can save some boot time memory. See below for how we use heuristics +in x86_64 case to keep this under check. + +cpu_online_map: Bitmap of all CPUs currently online. Its set in __cpu_up() +after a cpu is available for kernel scheduling and ready to receive +interrupts from devices. Its cleared when a cpu is brought down using +__cpu_disable(), before which all OS services including interrupts are +migrated to another target CPU. + +cpu_present_map: Bitmap of CPUs currently present in the system. Not all +of them may be online. When physical hotplug is processed by the relevant +subsystem (e.g ACPI) can change and new bit either be added or removed +from the map depending on the event is hot-add/hot-remove. There are currently +no locking rules as of now. Typical usage is to init topology during boot, +at which time hotplug is disabled. + +You really dont need to manipulate any of the system cpu maps. They should +be read-only for most use. When setting up per-cpu resources almost always use +cpu_possible_map/for_each_cpu() to iterate. + +Never use anything other than cpumask_t to represent bitmap of CPUs. + +#include + +for_each_cpu - Iterate over cpu_possible_map +for_each_online_cpu - Iterate over cpu_online_map +for_each_present_cpu - Iterate over cpu_present_map +for_each_cpu_mask(x,mask) - Iterate over some random collection of cpu mask. + +#include +lock_cpu_hotplug() and unlock_cpu_hotplug(): + +The above calls are used to inhibit cpu hotplug operations. While holding the +cpucontrol mutex, cpu_online_map will not change. If you merely need to avoid +cpus going away, you could also use preempt_disable() and preempt_enable() +for those sections. Just remember the critical section cannot call any +function that can sleep or schedule this process away. The preempt_disable() +will work as long as stop_machine_run() is used to take a cpu down. + +CPU Hotplug - Frequently Asked Questions. + +Q: How to i enable my kernel to support CPU hotplug? +A: When doing make defconfig, Enable CPU hotplug support + + "Processor type and Features" -> Support for Hotpluggable CPUs + +Make sure that you have CONFIG_HOTPLUG, and CONFIG_SMP turned on as well. + +You would need to enable CONFIG_HOTPLUG_CPU for SMP suspend/resume support +as well. + +Q: What architectures support CPU hotplug? +A: As of 2.6.14, the following architectures support CPU hotplug. 
+ +i386 (Intel), ppc, ppc64, parisc, s390, ia64 and x86_64 + +Q: How to test if hotplug is supported on the newly built kernel? +A: You should now notice an entry in sysfs. + +Check if sysfs is mounted, using the "mount" command. You should notice +an entry as shown below in the output. + +.... +none on /sys type sysfs (rw) +.... + +if this is not mounted, do the following. + +#mkdir /sysfs +#mount -t sysfs sys /sys + +now you should see entries for all present cpu, the following is an example +in a 8-way system. + +#pwd +#/sys/devices/system/cpu +#ls -l +total 0 +drwxr-xr-x 10 root root 0 Sep 19 07:44 . +drwxr-xr-x 13 root root 0 Sep 19 07:45 .. +drwxr-xr-x 3 root root 0 Sep 19 07:44 cpu0 +drwxr-xr-x 3 root root 0 Sep 19 07:44 cpu1 +drwxr-xr-x 3 root root 0 Sep 19 07:44 cpu2 +drwxr-xr-x 3 root root 0 Sep 19 07:44 cpu3 +drwxr-xr-x 3 root root 0 Sep 19 07:44 cpu4 +drwxr-xr-x 3 root root 0 Sep 19 07:44 cpu5 +drwxr-xr-x 3 root root 0 Sep 19 07:44 cpu6 +drwxr-xr-x 3 root root 0 Sep 19 07:48 cpu7 + +Under each directory you would find an "online" file which is the control +file to logically online/offline a processor. + +Q: Does hot-add/hot-remove refer to physical add/remove of cpus? +A: The usage of hot-add/remove may not be very consistently used in the code. +CONFIG_CPU_HOTPLUG enables logical online/offline capability in the kernel. +To support physical addition/removal, one would need some BIOS hooks and +the platform should have something like an attention button in PCI hotplug. +CONFIG_ACPI_HOTPLUG_CPU enables ACPI support for physical add/remove of CPUs. + +Q: How do i logically offline a CPU? +A: Do the following. + +#echo 0 > /sys/devices/system/cpu/cpuX/online + +once the logical offline is successful, check + +#cat /proc/interrupts + +you should now not see the CPU that you removed. Also online file will report +the state as 0 when a cpu if offline and 1 when its online. + +#To display the current cpu state. +#cat /sys/devices/system/cpu/cpuX/online + +Q: Why cant i remove CPU0 on some systems? +A: Some architectures may have some special dependency on a certain CPU. + +For e.g in IA64 platforms we have ability to sent platform interrupts to the +OS. a.k.a Corrected Platform Error Interrupts (CPEI). In current ACPI +specifications, we didn't have a way to change the target CPU. Hence if the +current ACPI version doesn't support such re-direction, we disable that CPU +by making it not-removable. + +In such cases you will also notice that the online file is missing under cpu0. + +Q: How do i find out if a particular CPU is not removable? +A: Depending on the implementation, some architectures may show this by the +absence of the "online" file. This is done if it can be determined ahead of +time that this CPU cannot be removed. + +In some situations, this can be a run time check, i.e if you try to remove the +last CPU, this will not be permitted. You can find such failures by +investigating the return value of the "echo" command. + +Q: What happens when a CPU is being logically offlined? +A: The following happen, listed in no particular order :-) + +- A notification is sent to in-kernel registered modules by sending an event + CPU_DOWN_PREPARE +- All process is migrated away from this outgoing CPU to a new CPU +- All interrupts targeted to this CPU is migrated to a new CPU +- timers/bottom half/task lets are also migrated to a new CPU +- Once all services are migrated, kernel calls an arch specific routine + __cpu_disable() to perform arch specific cleanup. 
+- Once this is successful, an event for successful cleanup is sent by an event + CPU_DEAD. + + "It is expected that each service cleans up when the CPU_DOWN_PREPARE + notifier is called, when CPU_DEAD is called its expected there is nothing + running on behalf of this CPU that was offlined" + +Q: If i have some kernel code that needs to be aware of CPU arrival and + departure, how to i arrange for proper notification? +A: This is what you would need in your kernel code to receive notifications. + + #include + static int __cpuinit foobar_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) + { + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + foobar_online_action(cpu); + break; + case CPU_DEAD: + foobar_dead_action(cpu); + break; + } + return NOTIFY_OK; + } + + static struct notifier_block foobar_cpu_notifer = + { + .notifier_call = foobar_cpu_callback, + }; + + +In your init function, + + register_cpu_notifier(&foobar_cpu_notifier); + +You can fail PREPARE notifiers if something doesn't work to prepare resources. +This will stop the activity and send a following CANCELED event back. + +CPU_DEAD should not be failed, its just a goodness indication, but bad +things will happen if a notifier in path sent a BAD notify code. + +Q: I don't see my action being called for all CPUs already up and running? +A: Yes, CPU notifiers are called only when new CPUs are on-lined or offlined. + If you need to perform some action for each cpu already in the system, then + + for_each_online_cpu(i) { + foobar_cpu_callback(&foobar_cpu_notifier, CPU_UP_PREPARE, i); + foobar_cpu_callback(&foobar-cpu_notifier, CPU_ONLINE, i); + } + +Q: If i would like to develop cpu hotplug support for a new architecture, + what do i need at a minimum? +A: The following are what is required for CPU hotplug infrastructure to work + correctly. + + - Make sure you have an entry in Kconfig to enable CONFIG_HOTPLUG_CPU + - __cpu_up() - Arch interface to bring up a CPU + - __cpu_disable() - Arch interface to shutdown a CPU, no more interrupts + can be handled by the kernel after the routine + returns. Including local APIC timers etc are + shutdown. + - __cpu_die() - This actually supposed to ensure death of the CPU. + Actually look at some example code in other arch + that implement CPU hotplug. The processor is taken + down from the idle() loop for that specific + architecture. __cpu_die() typically waits for some + per_cpu state to be set, to ensure the processor + dead routine is called to be sure positively. + +Q: I need to ensure that a particular cpu is not removed when there is some + work specific to this cpu is in progress. +A: First switch the current thread context to preferred cpu + + int my_func_on_cpu(int cpu) + { + cpumask_t saved_mask, new_mask = CPU_MASK_NONE; + int curr_cpu, err = 0; + + saved_mask = current->cpus_allowed; + cpu_set(cpu, new_mask); + err = set_cpus_allowed(current, new_mask); + + if (err) + return err; + + /* + * If we got scheduled out just after the return from + * set_cpus_allowed() before running the work, this ensures + * we stay locked. + */ + curr_cpu = get_cpu(); + + if (curr_cpu != cpu) { + err = -EAGAIN; + goto ret; + } else { + /* + * Do work : But cant sleep, since get_cpu() disables preempt + */ + } + ret: + put_cpu(); + set_cpus_allowed(current, saved_mask); + return err; + } + + +Q: How do we determine how many CPUs are available for hotplug. 
+A: There is no clear spec defined way from ACPI that can give us that + information today. Based on some input from Natalie of Unisys, + that the ACPI MADT (Multiple APIC Description Tables) marks those possible + CPUs in a system with disabled status. + + Andi implemented some simple heuristics that count the number of disabled + CPUs in MADT as hotpluggable CPUS. In the case there are no disabled CPUS + we assume 1/2 the number of CPUs currently present can be hotplugged. + + Caveat: Today's ACPI MADT can only provide 256 entries since the apicid field + in MADT is only 8 bits. + +User Space Notification + +Hotplug support for devices is common in Linux today. Its being used today to +support automatic configuration of network, usb and pci devices. A hotplug +event can be used to invoke an agent script to perform the configuration task. + +You can add /etc/hotplug/cpu.agent to handle hotplug notification user space +scripts. + + #!/bin/bash + # $Id: cpu.agent + # Kernel hotplug params include: + #ACTION=%s [online or offline] + #DEVPATH=%s + # + cd /etc/hotplug + . ./hotplug.functions + + case $ACTION in + online) + echo `date` ":cpu.agent" add cpu >> /tmp/hotplug.txt + ;; + offline) + echo `date` ":cpu.agent" remove cpu >>/tmp/hotplug.txt + ;; + *) + debug_mesg CPU $ACTION event not supported + exit 1 + ;; + esac -- cgit v1.2.2 From 4a0d11fae57989e24fe2ee3eff6d62d72db9716c Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sun, 8 Jan 2006 01:03:18 -0800 Subject: [PATCH] pivot_root: add comment Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index 2019899f2a..e5aa1eeb57 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1526,6 +1526,10 @@ static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd) * pointed to by put_old must yield the same directory as new_root. No other * file system may be mounted on put_old. After all, new_root is a mountpoint. * + * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. + * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives + * in this situation. + * * Notes: * - we don't move root/cwd if they are not at the root (reason: if something * cared enough to change them, it's probably wrong to force them elsewhere) -- cgit v1.2.2 From bf066c7db775a04bd761f8ea206f5522d0cf40ff Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sun, 8 Jan 2006 01:03:19 -0800 Subject: [PATCH] shared mounts: cleanup Small cleanups in shared mounts code. 
Signed-off-by: Miklos Szeredi Cc: Ram Pai Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 2 +- fs/pnode.c | 2 +- include/linux/fs.h | 2 +- include/linux/mount.h | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index e5aa1eeb57..3e8fb61ad5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -451,7 +451,7 @@ EXPORT_SYMBOL(may_umount); void release_mounts(struct list_head *head) { struct vfsmount *mnt; - while(!list_empty(head)) { + while (!list_empty(head)) { mnt = list_entry(head->next, struct vfsmount, mnt_hash); list_del_init(&mnt->mnt_hash); if (mnt->mnt_parent != mnt) { diff --git a/fs/pnode.c b/fs/pnode.c index aeeec8ba8d..f1871f773f 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -103,7 +103,7 @@ static struct vfsmount *propagation_next(struct vfsmount *m, struct vfsmount *next; struct vfsmount *master = m->mnt_master; - if ( master == origin->mnt_master ) { + if (master == origin->mnt_master) { next = next_peer(m); return ((next == origin) ? NULL : next); } else if (m->mnt_slave.next != &master->mnt_slave_list) diff --git a/include/linux/fs.h b/include/linux/fs.h index 7f140480c6..a1e28f0895 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -103,11 +103,11 @@ extern int dir_notify_enable; #define MS_MOVE 8192 #define MS_REC 16384 #define MS_VERBOSE 32768 +#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ #define MS_UNBINDABLE (1<<17) /* change to unbindable */ #define MS_PRIVATE (1<<18) /* change to private */ #define MS_SLAVE (1<<19) /* change to slave */ #define MS_SHARED (1<<20) /* change to shared */ -#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) diff --git a/include/linux/mount.h b/include/linux/mount.h index dd4e83eba9..b98a709f17 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -22,7 +22,8 @@ #define MNT_NOEXEC 0x04 #define MNT_SHARED 0x10 /* if the vfsmount is a shared mount */ #define MNT_UNBINDABLE 0x20 /* if the vfsmount is a unbindable mount */ -#define MNT_PNODE_MASK 0x30 /* propogation flag mask */ + +#define MNT_PNODE_MASK (MNT_SHARED | MNT_UNBINDABLE) struct vfsmount { struct list_head mnt_hash; -- cgit v1.2.2 From 71b9625744b7d4a6a2416389a5ba464bdf11f07f Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Sun, 8 Jan 2006 01:03:20 -0800 Subject: [PATCH] ext3: external journal device as a mount option The patch below adds a new mount option to allow the external journal device to be specified. The syntax is as follows: # mount -t ext3 -o journal_dev=0x0820 ... where 0x0820 means major=8 and minor=32. Signed-off-by: Johann Lombardi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/ext3.txt | 5 ++++ fs/ext3/super.c | 54 +++++++++++++++++++++++++++++++------- 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 9840d5b8d5..22e4040564 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt @@ -22,6 +22,11 @@ journal=inum When a journal already exists, this option is the inode which will represent the ext3 file system's journal file. +journal_dev=devnum When the external journal device's major/minor numbers + have changed, this option allows to specify the new + journal location. The journal device is identified + through its new major/minor numbers encoded in devnum. + noload Don't load the journal on mounting. 
data=journal All data are committed into the journal prior diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 4e6730622d..7c45acf945 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -43,7 +43,8 @@ #include "acl.h" #include "namei.h" -static int ext3_load_journal(struct super_block *, struct ext3_super_block *); +static int ext3_load_journal(struct super_block *, struct ext3_super_block *, + unsigned long journal_devnum); static int ext3_create_journal(struct super_block *, struct ext3_super_block *, int); static void ext3_commit_super (struct super_block * sb, @@ -628,7 +629,7 @@ enum { Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, - Opt_commit, Opt_journal_update, Opt_journal_inum, + Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, @@ -666,6 +667,7 @@ static match_table_t tokens = { {Opt_commit, "commit=%u"}, {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, + {Opt_journal_dev, "journal_dev=%u"}, {Opt_abort, "abort"}, {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, @@ -705,8 +707,9 @@ static unsigned long get_sb_block(void **data) return sb_block; } -static int parse_options (char * options, struct super_block *sb, - unsigned long * inum, unsigned long *n_blocks_count, int is_remount) +static int parse_options (char *options, struct super_block *sb, + unsigned long *inum, unsigned long *journal_devnum, + unsigned long *n_blocks_count, int is_remount) { struct ext3_sb_info *sbi = EXT3_SB(sb); char * p; @@ -839,6 +842,16 @@ static int parse_options (char * options, struct super_block *sb, return 0; *inum = option; break; + case Opt_journal_dev: + if (is_remount) { + printk(KERN_ERR "EXT3-fs: cannot specify " + "journal on remount\n"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *journal_devnum = option; + break; case Opt_noload: set_opt (sbi->s_mount_opt, NOLOAD); break; @@ -1331,6 +1344,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) unsigned long logic_sb_block; unsigned long offset = 0; unsigned long journal_inum = 0; + unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; int blocksize; @@ -1411,7 +1425,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, RESERVATION); - if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0)) + if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, + NULL, 0)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -1622,7 +1637,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) */ if (!test_opt(sb, NOLOAD) && EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { - if (ext3_load_journal(sb, es)) + if (ext3_load_journal(sb, es, journal_devnum)) goto failed_mount2; } else if (journal_inum) { if (ext3_create_journal(sb, es, journal_inum)) @@ -1902,15 +1917,24 @@ out_bdev: return NULL; } -static int ext3_load_journal(struct super_block * sb, - struct ext3_super_block * es) +static int ext3_load_journal(struct super_block *sb, + struct ext3_super_block *es, + unsigned long journal_devnum) { journal_t *journal; int journal_inum = le32_to_cpu(es->s_journal_inum); - 
dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + dev_t journal_dev; int err = 0; int really_read_only; + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + printk(KERN_INFO "EXT3-fs: external journal device major/minor " + "numbers have changed\n"); + journal_dev = new_decode_dev(journal_devnum); + } else + journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + really_read_only = bdev_read_only(sb->s_bdev); /* @@ -1969,6 +1993,16 @@ static int ext3_load_journal(struct super_block * sb, EXT3_SB(sb)->s_journal = journal; ext3_clear_journal_err(sb, es); + + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); + sb->s_dirt = 1; + + /* Make sure we flush the recovery flag to disk. */ + ext3_commit_super(sb, es, 1); + } + return 0; } @@ -2197,7 +2231,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb, NULL, &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { err = -EINVAL; goto restore_opts; } -- cgit v1.2.2 From 25ab7cd84eebdc1869d236029ada3a7b403de8ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Jan 2006 01:03:21 -0800 Subject: [PATCH] oprofile: Use vmalloc_node() in alloc_cpu_buffers() Make oprofile alloc_cpu_buffers() function NUMA aware, allocating each CPU local buffer in its memory node if possible. Signed-off-by: Eric Dumazet Cc: Philippe Elie Cc: John Levon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/oprofile/cpu_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 026f671ea5..78193e4bbd 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -52,7 +52,8 @@ int alloc_cpu_buffers(void) for_each_online_cpu(i) { struct oprofile_cpu_buffer * b = &cpu_buffer[i]; - b->buffer = vmalloc(sizeof(struct op_sample) * buffer_size); + b->buffer = vmalloc_node(sizeof(struct op_sample) * buffer_size, + cpu_to_node(i)); if (!b->buffer) goto fail; -- cgit v1.2.2 From 9f40668d7d14d4d16cedc2104bfb63a43584dacf Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Sun, 8 Jan 2006 01:03:22 -0800 Subject: [PATCH] ext3: remove trailing newlines from ext3_warning() calls Remove the trailing newlines in calls to ext3_warning(). This function already adds a trailing newline to the end of messages. Signed-off-by: Glauber de Oliveira Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/ialloc.c | 6 +++--- fs/ext3/namei.c | 2 +- fs/ext3/resize.c | 22 +++++++++++----------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 9e4a243762..69078079b1 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -651,7 +651,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) /* Error cases - e2fsck has already cleaned up for us */ if (ino > max_ino) { ext3_warning(sb, __FUNCTION__, - "bad orphan ino %lu! e2fsck was run?\n", ino); + "bad orphan ino %lu! 
e2fsck was run?", ino); goto out; } @@ -660,7 +660,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) bitmap_bh = read_inode_bitmap(sb, block_group); if (!bitmap_bh) { ext3_warning(sb, __FUNCTION__, - "inode bitmap error for orphan %lu\n", ino); + "inode bitmap error for orphan %lu", ino); goto out; } @@ -672,7 +672,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) !(inode = iget(sb, ino)) || is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) { ext3_warning(sb, __FUNCTION__, - "bad orphan inode %lu! e2fsck was run?\n", ino); + "bad orphan inode %lu! e2fsck was run?", ino); printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext3_test_bit(bit, bitmap_bh->b_data)); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index b3c690a3b5..af193a304e 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1476,7 +1476,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { ext3_warning(sb, __FUNCTION__, - "Directory index full!\n"); + "Directory index full!"); err = -ENOSPC; goto cleanup; } diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 6104ad3105..675aa24b00 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -340,7 +340,7 @@ static int verify_reserved_gdb(struct super_block *sb, while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ ext3_warning(sb, __FUNCTION__, - "reserved GDT %ld missing grp %d (%ld)\n", + "reserved GDT %ld missing grp %d (%ld)", blk, grp, grp * EXT3_BLOCKS_PER_GROUP(sb) + blk); return -EINVAL; @@ -393,7 +393,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (EXT3_SB(sb)->s_sbh->b_blocknr != le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) { ext3_warning(sb, __FUNCTION__, - "won't resize using backup superblock at %llu\n", + "won't resize using backup superblock at %llu", (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr); return -EPERM; } @@ -417,7 +417,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__u32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { ext3_warning(sb, __FUNCTION__, - "new group %u GDT block %lu not reserved\n", + "new group %u GDT block %lu not reserved", input->group, gdblock); err = -EINVAL; goto exit_dind; @@ -540,7 +540,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, for (res = 0; res < reserved_gdb; res++, blk++) { if (le32_to_cpu(*data) != blk) { ext3_warning(sb, __FUNCTION__, - "reserved block %lu not at offset %ld\n", + "reserved block %lu not at offset %ld", blk, (long)(data - (__u32 *)dind->b_data)); err = -EINVAL; goto exit_bh; @@ -683,7 +683,7 @@ exit_err: if (err) { ext3_warning(sb, __FUNCTION__, "can't update backup for group %d (err %d), " - "forcing fsck on next reboot\n", group, err); + "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT3_VALID_FS; sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS); mark_buffer_dirty(sbi->s_sbh); @@ -722,7 +722,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { ext3_warning(sb, __FUNCTION__, - "Can't resize non-sparse filesystem further\n"); + "Can't resize non-sparse filesystem further"); return -EPERM; } @@ -730,13 +730,13 @@ int ext3_group_add(struct 
super_block *sb, struct ext3_new_group_data *input) if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_RESIZE_INODE)){ ext3_warning(sb, __FUNCTION__, - "No reserved GDT blocks, can't resize\n"); + "No reserved GDT blocks, can't resize"); return -EPERM; } inode = iget(sb, EXT3_RESIZE_INO); if (!inode || is_bad_inode(inode)) { ext3_warning(sb, __FUNCTION__, - "Error opening resize inode\n"); + "Error opening resize inode"); iput(inode); return -ENOENT; } @@ -766,7 +766,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) lock_super(sb); if (input->group != EXT3_SB(sb)->s_groups_count) { ext3_warning(sb, __FUNCTION__, - "multiple resizers run on filesystem!\n"); + "multiple resizers run on filesystem!"); err = -EBUSY; goto exit_journal; } @@ -937,7 +937,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, if (last == 0) { ext3_warning(sb, __FUNCTION__, - "need to use ext2online to resize further\n"); + "need to use ext2online to resize further"); return -EPERM; } @@ -973,7 +973,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, lock_super(sb); if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __FUNCTION__, - "multiple resizers run on filesystem!\n"); + "multiple resizers run on filesystem!"); err = -EBUSY; goto exit_put; } -- cgit v1.2.2 From 29ba17231222c42ca3df5424f43949e2a6fddec2 Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Sun, 8 Jan 2006 01:03:23 -0800 Subject: [PATCH] ext3: use sbi instead of EXT3_SB() in resize code. There are places in the resize code in which EXT3_SB() macro is used after an statement like sbi = EXT3_SB(sb) is done. Inside the same function, both sbi and EXT3_SB() are used to reference the super block Altough it is not wrong, keeping it coherent increases legibility, IMHO. Signed-off-by: Glauber de Oliveira Costa Cc: "Stephen C. Tweedie" Cc: Andreas Dilger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/resize.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 675aa24b00..1041dab6de 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -31,7 +31,7 @@ static int verify_group_input(struct super_block *sb, unsigned start = le32_to_cpu(es->s_blocks_count); unsigned end = start + input->blocks_count; unsigned group = input->group; - unsigned itend = input->inode_table + EXT3_SB(sb)->s_itb_per_group; + unsigned itend = input->inode_table + sbi->s_itb_per_group; unsigned overhead = ext3_bg_has_super(sb, group) ? (1 + ext3_bg_num_gdb(sb, group) + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; @@ -764,7 +764,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) } lock_super(sb); - if (input->group != EXT3_SB(sb)->s_groups_count) { + if (input->group != sbi->s_groups_count) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); err = -EBUSY; @@ -799,7 +799,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. * - * The key field here is EXT3_SB(sb)->s_groups_count: as long as + * The key field here is sbi->s_groups_count: as long as * that retains its old value, nobody is going to access the new * group. 
* @@ -859,7 +859,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) smp_wmb(); /* Update the global fs size fields */ - EXT3_SB(sb)->s_groups_count++; + sbi->s_groups_count++; ext3_journal_dirty_metadata(handle, primary); @@ -874,7 +874,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) percpu_counter_mod(&sbi->s_freeinodes_counter, EXT3_INODES_PER_GROUP(sb)); - ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + ext3_journal_dirty_metadata(handle, sbi->s_sbh); sb->s_dirt = 1; exit_journal: -- cgit v1.2.2 From 08efd10edff199f0c992f04052e897a3f2048508 Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Sun, 8 Jan 2006 01:03:26 -0800 Subject: [PATCH] MAINTAINERS: line duplication uniq -d MAINTAINERS Signed-off-by: Nicolas Kaiser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7e780906d3..76dc820bc8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -927,7 +927,6 @@ S: Maintained FARSYNC SYNCHRONOUS DRIVER P: Kevin Curtis M: kevin.curtis@farsite.co.uk -M: kevin.curtis@farsite.co.uk W: http://www.farsite.co.uk/ S: Supported -- cgit v1.2.2 From 86174cdcb44c095ffd2e3ca69caa56244cf701d5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:03:28 -0800 Subject: [PATCH] remove unneeded sig->curr_target recalculation This patch removes unneeded sig->curr_target recalculation under 'if (atomic_dec_and_test(&sig->count))' in __exit_signal(). When sig->count == 0 the signal can't be sent to this task and next_thread(tsk) == tsk anyway. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 114cf9209b..08aa5b263f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -364,8 +364,6 @@ void __exit_signal(struct task_struct *tsk) posix_cpu_timers_exit(tsk); if (atomic_dec_and_test(&sig->count)) { posix_cpu_timers_exit_group(tsk); - if (tsk == sig->curr_target) - sig->curr_target = next_thread(tsk); tsk->signal = NULL; __exit_sighand(tsk); spin_unlock(&sighand->siglock); -- cgit v1.2.2 From 850d6fbe70c62a9792eac3e8ef34f2f09f209895 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:03:29 -0800 Subject: [PATCH] sigio: cleanup, don't take tasklist twice The only user of send_sigio_to_task() already holds tasklist_lock, so it is better not to send the signal via send_group_sig_info() (which takes tasklist recursively) but use group_send_sig_info(). The same change in send_sigurg()->send_sigurg_to_task(). Signed-off-by: Oleg Nesterov Cc: "Paul E. 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index 863b46e0d7..9903bde475 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -457,11 +457,11 @@ static void send_sigio_to_task(struct task_struct *p, else si.si_band = band_table[reason - POLL_IN]; si.si_fd = fd; - if (!send_group_sig_info(fown->signum, &si, p)) + if (!group_send_sig_info(fown->signum, &si, p)) break; /* fall-through: fall back on the old plain SIGIO signal */ case 0: - send_group_sig_info(SIGIO, SEND_SIG_PRIV, p); + group_send_sig_info(SIGIO, SEND_SIG_PRIV, p); } } @@ -495,7 +495,7 @@ static void send_sigurg_to_task(struct task_struct *p, struct fown_struct *fown) { if (sigio_perm(p, fown, SIGURG)) - send_group_sig_info(SIGURG, SEND_SIG_PRIV, p); + group_send_sig_info(SIGURG, SEND_SIG_PRIV, p); } int send_sigurg(struct fown_struct *fown) -- cgit v1.2.2 From 21b6bf143d05d77c350d9c6764ae090a877b66ea Mon Sep 17 00:00:00 2001 From: Jorn Dreyer Date: Sun, 8 Jan 2006 01:03:30 -0800 Subject: [PATCH] nfsroot: do not silently stop parsing on an unknown option It would be helpful if the kernel did not silently stop parsing nfs options, but instead warned about any he does not recognize. The attached patch adds one printk to do just that. It took me a couple of hours to find my configuration mistake. Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/nfsroot.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 985cc53b8d..e897e00c2c 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -275,7 +275,9 @@ static int __init root_nfs_parse(char *name, char *buf) case Opt_noacl: nfs_data.flags |= NFS_MOUNT_NOACL; break; - default : + default: + printk(KERN_WARNING "Root-NFS: unknown " + "option: %s\n", p); return 0; } } -- cgit v1.2.2 From 5160ee6fc891a9ca114be0e90fa6655647bb64b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Jan 2006 01:03:32 -0800 Subject: [PATCH] shrink dentry struct Some long time ago, dentry struct was carefully tuned so that on 32 bits UP, sizeof(struct dentry) was exactly 128, ie a power of 2, and a multiple of memory cache lines. Then RCU was added and dentry struct enlarged by two pointers, with nice results for SMP, but not so good on UP, because breaking the above tuning (128 + 8 = 136 bytes) This patch reverts this unwanted side effect, by using an union (d_u), where d_rcu and d_child are placed so that these two fields can share their memory needs. At the time d_free() is called (and d_rcu is really used), d_child is known to be empty and not touched by the dentry freeing. Lockless lookups only access d_name, d_parent, d_lock, d_op, d_flags (so the previous content of d_child is not needed if said dentry was unhashed but still accessed by a CPU because of RCU constraints) As dentry cache easily contains millions of entries, a size reduction is worth the extra complexity of the ugly C union. Signed-off-by: Eric Dumazet Cc: Dipankar Sarma Cc: Maneesh Soni Cc: Miklos Szeredi Cc: "Paul E. 
McKenney" Cc: Ian Kent Cc: Paul Jackson Cc: Al Viro Cc: Christoph Hellwig Cc: Trond Myklebust Cc: Neil Brown Cc: James Morris Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/usb/core/inode.c | 6 +++--- fs/autofs4/autofs_i.h | 2 +- fs/autofs4/expire.c | 12 ++++++------ fs/autofs4/inode.c | 4 ++-- fs/autofs4/root.c | 3 ++- fs/coda/cache.c | 2 +- fs/dcache.c | 34 +++++++++++++++++----------------- fs/libfs.c | 12 ++++++------ fs/ncpfs/dir.c | 2 +- fs/ncpfs/ncplib_kernel.h | 4 ++-- fs/smbfs/cache.c | 4 ++-- include/linux/dcache.h | 9 +++++++-- kernel/cpuset.c | 4 ++-- net/sunrpc/rpc_pipe.c | 2 +- security/selinux/selinuxfs.c | 2 +- 15 files changed, 54 insertions(+), 48 deletions(-) diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index c44bbedec8..4ddc453023 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -186,7 +186,7 @@ static void update_bus(struct dentry *bus) down(&bus->d_inode->i_sem); - list_for_each_entry(dev, &bus->d_subdirs, d_child) + list_for_each_entry(dev, &bus->d_subdirs, d_u.d_child) if (dev->d_inode) update_dev(dev); @@ -203,7 +203,7 @@ static void update_sb(struct super_block *sb) down(&root->d_inode->i_sem); - list_for_each_entry(bus, &root->d_subdirs, d_child) { + list_for_each_entry(bus, &root->d_subdirs, d_u.d_child) { if (bus->d_inode) { switch (S_IFMT & bus->d_inode->i_mode) { case S_IFDIR: @@ -319,7 +319,7 @@ static int usbfs_empty (struct dentry *dentry) spin_lock(&dcache_lock); list_for_each(list, &dentry->d_subdirs) { - struct dentry *de = list_entry(list, struct dentry, d_child); + struct dentry *de = list_entry(list, struct dentry, d_u.d_child); if (usbfs_positive(de)) { spin_unlock(&dcache_lock); return 0; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index fca83e28ed..385bed09b0 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -209,7 +209,7 @@ static inline int simple_empty_nolock(struct dentry *dentry) struct dentry *child; int ret = 0; - list_for_each_entry(child, &dentry->d_subdirs, d_child) + list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) if (simple_positive(child)) goto out; ret = 1; diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index feb6ac427d..dc39589df1 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -105,7 +105,7 @@ repeat: next = this_parent->d_subdirs.next; resume: while (next != &this_parent->d_subdirs) { - struct dentry *dentry = list_entry(next, struct dentry, d_child); + struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); /* Negative dentry - give up */ if (!simple_positive(dentry)) { @@ -138,7 +138,7 @@ resume: } if (this_parent != top) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; goto resume; } @@ -163,7 +163,7 @@ repeat: next = this_parent->d_subdirs.next; resume: while (next != &this_parent->d_subdirs) { - struct dentry *dentry = list_entry(next, struct dentry, d_child); + struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); /* Negative dentry - give up */ if (!simple_positive(dentry)) { @@ -199,7 +199,7 @@ cont: } if (this_parent != parent) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; goto resume; } @@ -238,7 +238,7 @@ static struct dentry *autofs4_expire(struct super_block *sb, /* On exit from the loop expire is set to a dgot dentry * to expire or it's NULL */ while ( next != &root->d_subdirs ) { - struct dentry *dentry = 
list_entry(next, struct dentry, d_child); + struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); /* Negative dentry - give up */ if ( !simple_positive(dentry) ) { @@ -302,7 +302,7 @@ next: expired, (int)expired->d_name.len, expired->d_name.name); spin_lock(&dcache_lock); list_del(&expired->d_parent->d_subdirs); - list_add(&expired->d_parent->d_subdirs, &expired->d_child); + list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&dcache_lock); return expired; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 818b37be51..2d3082854a 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -91,7 +91,7 @@ repeat: next = this_parent->d_subdirs.next; resume: while (next != &this_parent->d_subdirs) { - struct dentry *dentry = list_entry(next, struct dentry, d_child); + struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); /* Negative dentry - don`t care */ if (!simple_positive(dentry)) { @@ -117,7 +117,7 @@ resume: if (this_parent != sbi->root) { struct dentry *dentry = this_parent; - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; spin_unlock(&dcache_lock); DPRINTK("parent dentry %p %.*s", diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 2a771ec669..2241405ffc 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -143,7 +143,8 @@ static int autofs4_dcache_readdir(struct file * filp, void * dirent, filldir_t f } while(1) { - struct dentry *de = list_entry(list, struct dentry, d_child); + struct dentry *de = list_entry(list, + struct dentry, d_u.d_child); if (!d_unhashed(de) && de->d_inode) { spin_unlock(&dcache_lock); diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 80072fd9b7..c607d92335 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -93,7 +93,7 @@ static void coda_flag_children(struct dentry *parent, int flag) spin_lock(&dcache_lock); list_for_each(child, &parent->d_subdirs) { - de = list_entry(child, struct dentry, d_child); + de = list_entry(child, struct dentry, d_u.d_child); /* don't know what to do with negative dentries */ if ( ! 
de->d_inode ) continue; diff --git a/fs/dcache.c b/fs/dcache.c index 17e4391386..1536f15c4d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -71,7 +71,7 @@ struct dentry_stat_t dentry_stat = { static void d_callback(struct rcu_head *head) { - struct dentry * dentry = container_of(head, struct dentry, d_rcu); + struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu); if (dname_external(dentry)) kfree(dentry->d_name.name); @@ -86,7 +86,7 @@ static void d_free(struct dentry *dentry) { if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - call_rcu(&dentry->d_rcu, d_callback); + call_rcu(&dentry->d_u.d_rcu, d_callback); } /* @@ -193,7 +193,7 @@ kill_it: { list_del(&dentry->d_lru); dentry_stat.nr_unused--; } - list_del(&dentry->d_child); + list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ /*drops the locks, at that point nobody can reach this dentry */ dentry_iput(dentry); @@ -367,7 +367,7 @@ static inline void prune_one_dentry(struct dentry * dentry) struct dentry * parent; __d_drop(dentry); - list_del(&dentry->d_child); + list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ dentry_iput(dentry); parent = dentry->d_parent; @@ -518,7 +518,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; /* Have we found a mount point ? */ if (d_mountpoint(dentry)) @@ -532,7 +532,7 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; goto resume; } @@ -569,7 +569,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; if (!list_empty(&dentry->d_lru)) { @@ -610,7 +610,7 @@ dentry->d_parent->d_name.name, dentry->d_name.name, found); * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; #ifdef DCACHE_DEBUG printk(KERN_DEBUG "select_parent: ascending to %s/%s, found=%d\n", @@ -753,12 +753,12 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_parent = dget(parent); dentry->d_sb = parent->d_sb; } else { - INIT_LIST_HEAD(&dentry->d_child); + INIT_LIST_HEAD(&dentry->d_u.d_child); } spin_lock(&dcache_lock); if (parent) - list_add(&dentry->d_child, &parent->d_subdirs); + list_add(&dentry->d_u.d_child, &parent->d_subdirs); dentry_stat.nr_dentry++; spin_unlock(&dcache_lock); @@ -1310,8 +1310,8 @@ already_unhashed: /* Unhash the target: dput() will then get rid of it */ __d_drop(target); - list_del(&dentry->d_child); - list_del(&target->d_child); + list_del(&dentry->d_u.d_child); + list_del(&target->d_u.d_child); /* Switch the names.. 
*/ switch_names(dentry, target); @@ -1322,15 +1322,15 @@ already_unhashed: if (IS_ROOT(dentry)) { dentry->d_parent = target->d_parent; target->d_parent = target; - INIT_LIST_HEAD(&target->d_child); + INIT_LIST_HEAD(&target->d_u.d_child); } else { do_switch(dentry->d_parent, target->d_parent); /* And add them back to the (new) parent lists */ - list_add(&target->d_child, &target->d_parent->d_subdirs); + list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); } - list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); + list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); spin_unlock(&target->d_lock); spin_unlock(&dentry->d_lock); write_sequnlock(&rename_lock); @@ -1568,7 +1568,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; if (d_unhashed(dentry)||!dentry->d_inode) continue; @@ -1579,7 +1579,7 @@ resume: atomic_dec(&dentry->d_count); } if (this_parent != root) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; atomic_dec(&this_parent->d_count); this_parent = this_parent->d_parent; goto resume; diff --git a/fs/libfs.c b/fs/libfs.c index 58101dff2c..9c50523382 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -93,16 +93,16 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) loff_t n = file->f_pos - 2; spin_lock(&dcache_lock); - list_del(&cursor->d_child); + list_del(&cursor->d_u.d_child); p = file->f_dentry->d_subdirs.next; while (n && p != &file->f_dentry->d_subdirs) { struct dentry *next; - next = list_entry(p, struct dentry, d_child); + next = list_entry(p, struct dentry, d_u.d_child); if (!d_unhashed(next) && next->d_inode) n--; p = p->next; } - list_add_tail(&cursor->d_child, p); + list_add_tail(&cursor->d_u.d_child, p); spin_unlock(&dcache_lock); } } @@ -126,7 +126,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) { struct dentry *dentry = filp->f_dentry; struct dentry *cursor = filp->private_data; - struct list_head *p, *q = &cursor->d_child; + struct list_head *p, *q = &cursor->d_u.d_child; ino_t ino; int i = filp->f_pos; @@ -153,7 +153,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) } for (p=q->next; p != &dentry->d_subdirs; p=p->next) { struct dentry *next; - next = list_entry(p, struct dentry, d_child); + next = list_entry(p, struct dentry, d_u.d_child); if (d_unhashed(next) || !next->d_inode) continue; @@ -261,7 +261,7 @@ int simple_empty(struct dentry *dentry) int ret = 0; spin_lock(&dcache_lock); - list_for_each_entry(child, &dentry->d_subdirs, d_child) + list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) if (simple_positive(child)) goto out; ret = 1; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index a9f7a8ab1d..cfd76f431d 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -365,7 +365,7 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dent = list_entry(next, struct dentry, d_child); + dent = list_entry(next, struct dentry, d_u.d_child); if ((unsigned long)dent->d_fsdata == fpos) { if (dent->d_inode) dget_locked(dent); diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 9e4dc30c24..799e5c2bec 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -196,7 +196,7 @@ ncp_renew_dentries(struct dentry 
*parent) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dentry = list_entry(next, struct dentry, d_child); + dentry = list_entry(next, struct dentry, d_u.d_child); if (dentry->d_fsdata == NULL) ncp_age_dentry(server, dentry); @@ -218,7 +218,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dentry = list_entry(next, struct dentry, d_child); + dentry = list_entry(next, struct dentry, d_u.d_child); dentry->d_fsdata = NULL; ncp_age_dentry(server, dentry); next = next->next; diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c index f3e6b81288..74b86d9725 100644 --- a/fs/smbfs/cache.c +++ b/fs/smbfs/cache.c @@ -66,7 +66,7 @@ smb_invalidate_dircache_entries(struct dentry *parent) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dentry = list_entry(next, struct dentry, d_child); + dentry = list_entry(next, struct dentry, d_u.d_child); dentry->d_fsdata = NULL; smb_age_dentry(server, dentry); next = next->next; @@ -100,7 +100,7 @@ smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dent = list_entry(next, struct dentry, d_child); + dent = list_entry(next, struct dentry, d_u.d_child); if ((unsigned long)dent->d_fsdata == fpos) { if (dent->d_inode) dget_locked(dent); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 46a2ba6175..a3ed5e059d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -95,14 +95,19 @@ struct dentry { struct qstr d_name; struct list_head d_lru; /* LRU list */ - struct list_head d_child; /* child of parent list */ + /* + * d_child and d_rcu can share memory + */ + union { + struct list_head d_child; /* child of parent list */ + struct rcu_head d_rcu; + } d_u; struct list_head d_subdirs; /* our children */ struct list_head d_alias; /* inode alias list */ unsigned long d_time; /* used by d_revalidate */ struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ void *d_fsdata; /* fs-specific data */ - struct rcu_head d_rcu; struct dcookie_struct *d_cookie; /* cookie, if any */ int d_mounted; unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e04c2da9da..eab64e23bc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -331,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) spin_lock(&dcache_lock); node = dentry->d_subdirs.next; while (node != &dentry->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_child); + struct dentry *d = list_entry(node, struct dentry, d_u.d_child); list_del_init(node); if (d->d_inode) { d = dget_locked(d); @@ -343,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) } node = dentry->d_subdirs.next; } - list_del_init(&dentry->d_child); + list_del_init(&dentry->d_u.d_child); spin_unlock(&dcache_lock); remove_dir(dentry); } diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 24cc23af9b..e14c1cae74 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -495,7 +495,7 @@ rpc_depopulate(struct dentry *parent) repeat: spin_lock(&dcache_lock); list_for_each_safe(pos, next, &parent->d_subdirs) { - dentry = list_entry(pos, struct dentry, d_child); + dentry = list_entry(pos, struct dentry, d_u.d_child); spin_lock(&dentry->d_lock); if (!d_unhashed(dentry)) { 
dget_locked(dentry); diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index e59da6398d..b5fa02d17b 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -889,7 +889,7 @@ static void sel_remove_bools(struct dentry *de) spin_lock(&dcache_lock); node = de->d_subdirs.next; while (node != &de->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_child); + struct dentry *d = list_entry(node, struct dentry, d_u.d_child); list_del_init(node); if (d->d_inode) { -- cgit v1.2.2 From cc398c2eae35b13d77b77337136325edc6ca94ca Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 8 Jan 2006 01:03:34 -0800 Subject: [PATCH] drivers/connector/cn_proc.c typos The parameter to put_cpu_var() is unreferenced by the implementation, and the compiler doesn't try to comprehend comments, so this wouldn't cause any problem, but if bugged me enough to post a fix :-) Signed-off-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/connector/cn_proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 969d2b4aae..385e52930c 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -34,14 +34,14 @@ static atomic_t proc_event_num_listeners = ATOMIC_INIT(0); static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC }; -/* proc_counts is used as the sequence number of the netlink message */ +/* proc_event_counts is used as the sequence number of the netlink message */ static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 }; static inline void get_seq(__u32 *ts, int *cpu) { *ts = get_cpu_var(proc_event_counts)++; *cpu = smp_processor_id(); - put_cpu_var(proc_counts); + put_cpu_var(proc_event_counts); } void proc_fork_connector(struct task_struct *task) -- cgit v1.2.2 From dda6ebde96044e9b5f1b14588659b39b4e6c08e7 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Sun, 8 Jan 2006 01:03:35 -0800 Subject: [PATCH] Fix handling of ELF segments with zero filesize mmap() returns -EINVAL if given a zero length, and thus elf_map() in binfmt_elf.c does likewise if it attempts to map a (page-aligned) ELF segment with zero filesize. Such a situation never arises with the default linker scripts, but there's nothing inherently wrong with zero-filesize (but non-zero memsize) ELF segments. Custom linker scripts can generate them, and the kernel should be able to map them; this patch makes it so. 
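As a stand-alone illustration of the mmap() behaviour described above (a hedged user-space sketch, not part of the patch; the anonymous mapping and flags are chosen only for brevity), a zero-length request is rejected with EINVAL:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	/* ask for a zero-length mapping, the case a zero-filesize
	 * PT_LOAD segment would otherwise hand to do_mmap() */
	void *p = mmap(NULL, 0, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		printf("mmap with length 0 failed: %s\n", strerror(errno));
	else
		printf("mmap with length 0 unexpectedly succeeded\n");
	return 0;
}

The elf_map() change in the diff below avoids exactly this by skipping do_mmap() for such segments and returning the page-aligned address directly.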
Signed-off-by: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f36f221020..288386b1de 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -288,11 +288,17 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type) { unsigned long map_addr; + unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr); down_write(¤t->mm->mmap_sem); - map_addr = do_mmap(filep, ELF_PAGESTART(addr), - eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr), prot, type, - eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr)); + /* mmap() will return -EINVAL if given a zero size, but a + * segment with zero filesize is perfectly valid */ + if (eppnt->p_filesz + pageoffset) + map_addr = do_mmap(filep, ELF_PAGESTART(addr), + eppnt->p_filesz + pageoffset, prot, type, + eppnt->p_offset - pageoffset); + else + map_addr = ELF_PAGESTART(addr); up_write(¤t->mm->mmap_sem); return(map_addr); } -- cgit v1.2.2 From 44fce35f29a7f2d976d9160bfbc55635b459a6a0 Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Sun, 8 Jan 2006 01:03:37 -0800 Subject: [PATCH] drivers/mfd: header included twice linux/delay.h included twice Signed-off-by: Nicolas Kaiser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/mfd/ucb1x00-ts.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/mfd/ucb1x00-ts.c b/drivers/mfd/ucb1x00-ts.c index 551061c2ea..79fd062ccb 100644 --- a/drivers/mfd/ucb1x00-ts.c +++ b/drivers/mfd/ucb1x00-ts.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include -- cgit v1.2.2 From 90f2447d08a5fbddc8b58f8a6425b0513807f919 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 8 Jan 2006 01:03:38 -0800 Subject: [PATCH] Documentation: Small applying-patches.txt update Minor update to Documentation/applying-patches.txt Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/applying-patches.txt | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/Documentation/applying-patches.txt b/Documentation/applying-patches.txt index 681e426e24..05a08c2c18 100644 --- a/Documentation/applying-patches.txt +++ b/Documentation/applying-patches.txt @@ -2,7 +2,8 @@ Applying Patches To The Linux Kernel ------------------------------------ - (Written by Jesper Juhl, August 2005) + Original by: Jesper Juhl, August 2005 + Last update: 2005-12-02 @@ -118,7 +119,7 @@ wrong. When patch encounters a change that it can't fix up with fuzz it rejects it outright and leaves a file with a .rej extension (a reject file). You can -read this file to see exactely what change couldn't be applied, so you can +read this file to see exactly what change couldn't be applied, so you can go fix it up by hand if you wish. If you don't have any third party patches applied to your kernel source, but @@ -127,7 +128,7 @@ and have made no modifications yourself to the source files, then you should never see a fuzz or reject message from patch. If you do see such messages anyway, then there's a high risk that either your local source tree or the patch file is corrupted in some way. In that case you should probably try -redownloading the patch and if things are still not OK then you'd be advised +re-downloading the patch and if things are still not OK then you'd be advised to start with a fresh tree downloaded in full from kernel.org. 
Let's look a bit more at some of the messages patch can produce. @@ -180,9 +181,11 @@ wish to apply. Are there any alternatives to `patch'? --- - Yes there are alternatives. You can use the `interdiff' program -(http://cyberelk.net/tim/patchutils/) to generate a patch representing the -differences between two patches and then apply the result. + Yes there are alternatives. + + You can use the `interdiff' program (http://cyberelk.net/tim/patchutils/) to +generate a patch representing the differences between two patches and then +apply the result. This will let you move from something like 2.6.12.2 to 2.6.12.3 in a single step. The -z flag to interdiff will even let you feed it patches in gzip or bzip2 compressed form directly without the use of zcat or bzcat or manual @@ -197,7 +200,7 @@ do the additional steps since interdiff can get things wrong in some cases. Another alternative is `ketchup', which is a python script for automatic downloading and applying of patches (http://www.selenic.com/ketchup/). -Other nice tools are diffstat which shows a summary of changes made by a + Other nice tools are diffstat which shows a summary of changes made by a patch, lsdiff which displays a short listing of affected files in a patch file, along with (optionally) the line numbers of the start of each patch and grepdiff which displays a list of the files modified by a patch where @@ -258,7 +261,7 @@ $ patch -p1 -R < ../patch-2.6.11.1 # revert the 2.6.11.1 patch # source dir is now 2.6.11 $ patch -p1 < ../patch-2.6.12 # apply new 2.6.12 patch $ cd .. -$ mv linux-2.6.11.1 inux-2.6.12 # rename source dir +$ mv linux-2.6.11.1 linux-2.6.12 # rename source dir The 2.6.x.y kernels @@ -433,7 +436,11 @@ $ cd .. $ mv linux-2.6.12-mm1 linux-2.6.13-rc3-mm3 # rename the source dir -This concludes this list of explanations of the various kernel trees and I -hope you are now crystal clear on how to apply the various patches and help -testing the kernel. +This concludes this list of explanations of the various kernel trees. +I hope you are now clear on how to apply the various patches and help testing +the kernel. + +Thank you's to Randy Dunlap, Rolf Eike Beer, Linus Torvalds, Bodo Eggert, +Johannes Stezenbach, Grant Coady, Pavel Machek and others that I may have +forgotten for their reviews and contributions to this document. -- cgit v1.2.2 From e78c9a004aadebe22306c81d1a7f1d1278dc37f9 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 8 Jan 2006 01:03:39 -0800 Subject: [PATCH] fs: remove s_old_blocksize from struct super_block This patch inlines the single user of struct super_block field s_old_blocksize and removes the field. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 3 +-- include/linux/fs.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/super.c b/fs/super.c index 5a347a4f67..0a30e51692 100644 --- a/fs/super.c +++ b/fs/super.c @@ -700,8 +700,7 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type, s->s_flags = flags; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); - s->s_old_blocksize = block_size(bdev); - sb_set_blocksize(s, s->s_old_blocksize); + sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, data, flags & MS_VERBOSE ? 
1 : 0); if (error) { up_write(&s->s_umount); diff --git a/include/linux/fs.h b/include/linux/fs.h index a1e28f0895..4c82219b0f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -808,7 +808,6 @@ struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ unsigned long s_blocksize; - unsigned long s_old_blocksize; unsigned char s_blocksize_bits; unsigned char s_dirt; unsigned long long s_maxbytes; /* Max file size */ -- cgit v1.2.2 From f867bac65419a98c9682f4409e087582d29ec5f6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Jan 2006 01:03:40 -0800 Subject: [PATCH] remove unused blkp field in percpu_data I found that blkp field was not used in kernel tree. As most of the times NR_CPUS is a power of two and kmalloc() memory blocks too, this extra field basically doubles the memory space allocated in __alloc_percpu() to store the 'struct percpu_data' (for example, if NR_CPUS=8 on i386, kmalloc(4*8+4) returns a 64 bytes block instead of a 32 bytes block after this patch) Signed-off-by: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/percpu.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 20317d88de..cb9039a21f 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -19,7 +19,6 @@ struct percpu_data { void *ptrs[NR_CPUS]; - void *blkp; }; /* -- cgit v1.2.2 From 9841d61d75da5e46ed7a978bed4f50c78b1d87fd Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Sun, 8 Jan 2006 01:03:41 -0800 Subject: [PATCH] Add tainting for proprietary helper modules Kernels that have had Windows drivers loaded into them are undebuggable. I've wasted a number of hours chasing bugs filed in Fedora bugzilla only to find out much later that the user had used such 'helpers', and their problems were unreproducable without them loaded. Acked-by: Arjan van de Ven Signed-off-by: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/module.c b/kernel/module.c index 4b06bbad49..5d9078d6f0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1715,6 +1715,11 @@ static struct module *load_module(void __user *umod, /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); + if (strcmp(mod->name, "ndiswrapper") == 0) + add_taint(TAINT_PROPRIETARY_MODULE); + if (strcmp(mod->name, "driverloader") == 0) + add_taint(TAINT_PROPRIETARY_MODULE); + #ifdef CONFIG_MODULE_UNLOAD /* Set up MODINFO_ATTR fields */ setup_modinfo(mod, sechdrs, infoindex); -- cgit v1.2.2 From d84f520348d77e61f936227a048403dbc349fff1 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Sun, 8 Jan 2006 01:03:42 -0800 Subject: [PATCH] Extend RCU torture module to test tickless idle CPU This patch forces RCU torture threads off various CPUs in the system allowing them to become idle and go tickless. Meant to test support for such tickless idle CPU in RCU. Signed-off-by: Srivatsa Vaddagiri Cc: Dipankar Sarma Cc: "Paul E. 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 91 insertions(+), 5 deletions(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 75174c8152..773219907d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -48,9 +48,11 @@ MODULE_LICENSE("GPL"); static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ -static int stat_interval = 0; /* Interval between stats, in seconds. */ +static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ -static int verbose = 0; /* Print more debug info. */ +static int verbose; /* Print more debug info. */ +static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ +static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ MODULE_PARM(nreaders, "i"); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); @@ -58,6 +60,10 @@ MODULE_PARM(stat_interval, "i"); MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); MODULE_PARM(verbose, "i"); MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); +MODULE_PARM(test_no_idle_hz, "i"); +MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); +MODULE_PARM(shuffle_interval, "i"); +MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); #define TORTURE_FLAG "rcutorture: " #define PRINTK_STRING(s) \ do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) @@ -72,6 +78,7 @@ static int nrealreaders; static struct task_struct *writer_task; static struct task_struct **reader_tasks; static struct task_struct *stats_task; +static struct task_struct *shuffler_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -375,12 +382,77 @@ rcu_torture_stats(void *arg) return 0; } +static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ + +/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case + * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. + */ +void rcu_torture_shuffle_tasks(void) +{ + cpumask_t tmp_mask = CPU_MASK_ALL; + int i; + + lock_cpu_hotplug(); + + /* No point in shuffling if there is only one online CPU (ex: UP) */ + if (num_online_cpus() == 1) { + unlock_cpu_hotplug(); + return; + } + + if (rcu_idle_cpu != -1) + cpu_clear(rcu_idle_cpu, tmp_mask); + + set_cpus_allowed(current, tmp_mask); + + if (reader_tasks != NULL) { + for (i = 0; i < nrealreaders; i++) + if (reader_tasks[i]) + set_cpus_allowed(reader_tasks[i], tmp_mask); + } + + if (writer_task) + set_cpus_allowed(writer_task, tmp_mask); + + if (stats_task) + set_cpus_allowed(stats_task, tmp_mask); + + if (rcu_idle_cpu == -1) + rcu_idle_cpu = num_online_cpus() - 1; + else + rcu_idle_cpu--; + + unlock_cpu_hotplug(); +} + +/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the + * system to become idle at a time and cut off its timer ticks. This is meant + * to test the support for such tickless idle CPU in RCU. 
+ */ +static int +rcu_torture_shuffle(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); + do { + schedule_timeout_interruptible(shuffle_interval * HZ); + rcu_torture_shuffle_tasks(); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); + return 0; +} + static void rcu_torture_cleanup(void) { int i; fullstop = 1; + if (shuffler_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); + kthread_stop(shuffler_task); + } + shuffler_task = NULL; + if (writer_task != NULL) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); kthread_stop(writer_task); @@ -429,9 +501,11 @@ rcu_torture_init(void) nrealreaders = nreaders; else nrealreaders = 2 * num_online_cpus(); - printk(KERN_ALERT TORTURE_FLAG - "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", - nrealreaders, stat_interval, verbose); + printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d " + "stat_interval=%d verbose=%d test_no_idle_hz=%d " + "shuffle_interval = %d\n", + nrealreaders, stat_interval, verbose, test_no_idle_hz, + shuffle_interval); fullstop = 0; /* Set up the freelist. */ @@ -501,6 +575,18 @@ rcu_torture_init(void) goto unwind; } } + if (test_no_idle_hz) { + rcu_idle_cpu = num_online_cpus() - 1; + /* Create the shuffler thread */ + shuffler_task = kthread_run(rcu_torture_shuffle, NULL, + "rcu_torture_shuffle"); + if (IS_ERR(shuffler_task)) { + firsterr = PTR_ERR(shuffler_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); + shuffler_task = NULL; + goto unwind; + } + } return 0; unwind: -- cgit v1.2.2 From 99aef427e206f4ae7dcf0b18f66416145fea5d20 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Sun, 8 Jan 2006 01:03:43 -0800 Subject: [PATCH] update to the initramfs docs Based on questions people have asked me. Repeatedly. Signed-off-by: Rob Landley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../filesystems/ramfs-rootfs-initramfs.txt | 72 +++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt index b3404a0325..60ab61e54e 100644 --- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt @@ -143,12 +143,26 @@ as the following example: dir /mnt 755 0 0 file /init initramfs/init.sh 755 0 0 +Run "usr/gen_init_cpio" (after the kernel build) to get a usage message +documenting the above file format. + One advantage of the text file is that root access is not required to set permissions or create device nodes in the new archive. (Note that those two example "file" entries expect to find files named "init.sh" and "busybox" in a directory called "initramfs", under the linux-2.6.* directory. See Documentation/early-userspace/README for more details.) +The kernel does not depend on external cpio tools, gen_init_cpio is created +from usr/gen_init_cpio.c which is entirely self-contained, and the kernel's +boot-time extractor is also (obviously) self-contained. 
However, if you _do_ +happen to have cpio installed, the following command line can extract the +generated cpio image back into its component files: + + cpio -i -d -H newc -F initramfs_data.cpio --no-absolute-filenames + +Contents of initramfs: +---------------------- + If you don't already understand what shared libraries, devices, and paths you need to get a minimal root filesystem up and running, here are some references: @@ -161,13 +175,69 @@ designed to be a tiny C library to statically link early userspace code against, along with some related utilities. It is BSD licensed. I use uClibc (http://www.uclibc.org) and busybox (http://www.busybox.net) -myself. These are LGPL and GPL, respectively. +myself. These are LGPL and GPL, respectively. (A self-contained initramfs +package is planned for the busybox 1.2 release.) In theory you could use glibc, but that's not well suited for small embedded uses like this. (A "hello world" program statically linked against glibc is over 400k. With uClibc it's 7k. Also note that glibc dlopens libnss to do name lookups, even when otherwise statically linked.) +Why cpio rather than tar? +------------------------- + +This decision was made back in December, 2001. The discussion started here: + + http://www.uwsg.iu.edu/hypermail/linux/kernel/0112.2/1538.html + +And spawned a second thread (specifically on tar vs cpio), starting here: + + http://www.uwsg.iu.edu/hypermail/linux/kernel/0112.2/1587.html + +The quick and dirty summary version (which is no substitute for reading +the above threads) is: + +1) cpio is a standard. It's decades old (from the AT&T days), and already + widely used on Linux (inside RPM, Red Hat's device driver disks). Here's + a Linux Journal article about it from 1996: + + http://www.linuxjournal.com/article/1213 + + It's not as popular as tar because the traditional cpio command line tools + require _truly_hideous_ command line arguments. But that says nothing + either way about the archive format, and there are alternative tools, + such as: + + http://freshmeat.net/projects/afio/ + +2) The cpio archive format chosen by the kernel is simpler and cleaner (and + thus easier to create and parse) than any of the (literally dozens of) + various tar archive formats. The complete initramfs archive format is + explained in buffer-format.txt, created in usr/gen_init_cpio.c, and + extracted in init/initramfs.c. All three together come to less than 26k + total of human-readable text. + +3) The GNU project standardizing on tar is approximately as relevant as + Windows standardizing on zip. Linux is not part of either, and is free + to make its own technical decisions. + +4) Since this is a kernel internal format, it could easily have been + something brand new. The kernel provides its own tools to create and + extract this format anyway. Using an existing standard was preferable, + but not essential. + +5) Al Viro made the decision (quote: "tar is ugly as hell and not going to be + supported on the kernel side"): + + http://www.uwsg.iu.edu/hypermail/linux/kernel/0112.2/1540.html + + explained his reasoning: + + http://www.uwsg.iu.edu/hypermail/linux/kernel/0112.2/1550.html + http://www.uwsg.iu.edu/hypermail/linux/kernel/0112.2/1638.html + + and, most importantly, designed and implemented the initramfs code. 
+ Future directions: ------------------ -- cgit v1.2.2 From 87ba81dba431232548ce29d5d224115d0c2355ac Mon Sep 17 00:00:00 2001 From: Valentine Barshak Date: Sun, 8 Jan 2006 01:03:44 -0800 Subject: [PATCH] fadvise: return ESPIPE on FIFO/pipe The patch makes posix_fadvise return ESPIPE on FIFO/pipe in order to be fully POSIX-compliant. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/fadvise.c b/mm/fadvise.c index 5f19e87bc5..d257c89e77 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -37,6 +37,11 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) if (!file) return -EBADF; + if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) { + ret = -ESPIPE; + goto out; + } + mapping = file->f_mapping; if (!mapping || len < 0) { ret = -EINVAL; -- cgit v1.2.2 From 5e38291d80086f6972f471c7caffa03184de0bf0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 8 Jan 2006 01:03:46 -0800 Subject: [PATCH] Don't attempt to power off if power off is not implemented The problem. It is expected that /sbin/halt -p works exactly like /sbin/halt, when the kernel does not implement power off functionality. The kernel can do a lot of work in the reboot notifiers and in device_shutdown before we even get to machine_power_off. Some of that shutdown is not safe if you are leaving the power on, and it definitely gets in the way of using sysrq or pressing ctrl-alt-del. Since the shutdown happens in generic code there is no way to fix this in architecture specific code :( Some machines are kernel oopsing today because of this. The simple solution is to turn LINUX_REBOOT_CMD_POWER_OFF into LINUX_REBOOT_CMD_HALT if power_off functionality is not implemented. This has the unfortunate side effect of disabling the power off functionality on architectures that leave pm_power_off to null and still implement something in machine_power_off. And it will break the build on some architectures that don't have a pm_power_off variable at all. On both counts I say tough. For architectures like alpha that don't implement the pm_power_off variable pm_power_off is declared in linux/pm.h and it is a generic part of our power management code, and all architectures should implement it. For architectures like parisc that have a default power off method in machine_power_off if pm_power_off is not implemented or fails. It is easy enough to set the pm_power_off variable. And nothing bad happens there, the machines just stop powering off. The current semantics are impossible without a flag at the top level so we can avoid the problem code if a power off is not implemented. pm_power_off is as good a flag as any with the bonus that it works without modification on at least x86, x86_64, powerpc, and ppc today. Andrew can you pick this up and put this in the mm tree. Kernels that don't compile or don't power off seem saner than kernels that oops or panic. Until we get the arch specific patches for the problem architectures this probably isn't smart to push into the stable kernel. Unfortunately I don't have the time at the moment to walk through every architecture and make them work. And even if I did I couldn't test it :( From: Hirokazu Takata Add pm_power_off() for build fix of arch/m32r/kernel/process.c. From: Miklos Szeredi UML build fix Signed-off-by: Eric W. 
Biederman Signed-off-by: Hayato Fujiwara Signed-off-by: Hirokazu Takata Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/process.c | 5 +++++ arch/m32r/kernel/process.c | 4 ++++ arch/um/kernel/reboot.c | 2 ++ kernel/sys.c | 6 ++++++ 4 files changed, 17 insertions(+) diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index a8682612ab..abb739b88e 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -43,6 +43,11 @@ #include "proto.h" #include "pci_impl.h" +/* + * Power off function, if any + */ +void (*pm_power_off)(void) = machine_power_off; + void cpu_idle(void) { diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index cc4b571e5d..3bf55d9293 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -50,6 +50,10 @@ unsigned long thread_saved_pc(struct task_struct *tsk) * Powermanagement idle function, if any.. */ void (*pm_idle)(void) = NULL; +EXPORT_SYMBOL(pm_idle); + +void (*pm_power_off)(void) = NULL; +EXPORT_SYMBOL(pm_power_off); void disable_hlt(void) { diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c index a637e885c5..6f1a3a2881 100644 --- a/arch/um/kernel/reboot.c +++ b/arch/um/kernel/reboot.c @@ -12,6 +12,8 @@ #include "mode.h" #include "choose-mode.h" +void (*pm_power_off)(void); + #ifdef CONFIG_SMP static void kill_idlers(int me) { diff --git a/kernel/sys.c b/kernel/sys.c index eecf84526a..d8e49e6590 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -489,6 +489,12 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; + /* Instead of trying to make the power_off code look like + * halt when pm_power_off is not set do it the easy way. + */ + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + cmd = LINUX_REBOOT_CMD_HALT; + lock_kernel(); switch (cmd) { case LINUX_REBOOT_CMD_RESTART: -- cgit v1.2.2 From d09cf7d77f62f6fb2f6d63fe5980583805f2d559 Mon Sep 17 00:00:00 2001 From: Kylene Jo Hall Date: Sun, 8 Jan 2006 01:03:48 -0800 Subject: [PATCH] tpmdd: remove global event log Remove global event log in the tpm bios event measurement log code that would have caused problems when the code was run concurrently. A log is now allocated and attached to the seq file upon open and destroyed appropriately. Signed-off-by: Kylene Jo Hall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/tpm/tpm_bios.c | 92 ++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 30 deletions(-) diff --git a/drivers/char/tpm/tpm_bios.c b/drivers/char/tpm/tpm_bios.c index 5e1f4dfefa..aedf7a8e6d 100644 --- a/drivers/char/tpm/tpm_bios.c +++ b/drivers/char/tpm/tpm_bios.c @@ -29,6 +29,11 @@ #define MAX_TEXT_EVENT 1000 /* Max event string length */ #define ACPI_TCPA_SIG "TCPA" /* 0x41504354 /'TCPA' */ +struct tpm_bios_log { + void *bios_event_log; + void *bios_event_log_end; +}; + struct acpi_tcpa { struct acpi_table_header hdr; u16 reserved; @@ -117,39 +122,34 @@ static const char* tcpa_pc_event_id_strings[] = { "S-CRTM POST Contents", }; -/* (Binary) bios measurement buffer */ -static void *tcg_eventlog; -static void *tcg_eventlog_addr_limit; /* MAX */ - /* returns pointer to start of pos. 
entry of tcg log */ static void *tpm_bios_measurements_start(struct seq_file *m, loff_t *pos) { loff_t i; - void *addr = tcg_eventlog; + struct tpm_bios_log *log = m->private; + void *addr = log->bios_event_log; + void *limit = log->bios_event_log_end; struct tcpa_event *event; /* read over *pos measurements */ for (i = 0; i < *pos; i++) { event = addr; - if ((addr + sizeof(struct tcpa_event)) < - tcg_eventlog_addr_limit) { + if ((addr + sizeof(struct tcpa_event)) < limit) { if (event->event_type == 0 && event->event_size == 0) return NULL; - addr += - sizeof(struct tcpa_event) + event->event_size; + addr += sizeof(struct tcpa_event) + event->event_size; } } /* now check if current entry is valid */ - if ((addr + sizeof(struct tcpa_event)) >= tcg_eventlog_addr_limit) + if ((addr + sizeof(struct tcpa_event)) >= limit) return NULL; event = addr; if ((event->event_type == 0 && event->event_size == 0) || - ((addr + sizeof(struct tcpa_event) + event->event_size) >= - tcg_eventlog_addr_limit)) + ((addr + sizeof(struct tcpa_event) + event->event_size) >= limit)) return NULL; return addr; @@ -159,11 +159,13 @@ static void *tpm_bios_measurements_next(struct seq_file *m, void *v, loff_t *pos) { struct tcpa_event *event = v; + struct tpm_bios_log *log = m->private; + void *limit = log->bios_event_log_end; v += sizeof(struct tcpa_event) + event->event_size; /* now check if current entry is valid */ - if ((v + sizeof(struct tcpa_event)) >= tcg_eventlog_addr_limit) + if ((v + sizeof(struct tcpa_event)) >= limit) return NULL; event = v; @@ -172,8 +174,7 @@ static void *tpm_bios_measurements_next(struct seq_file *m, void *v, return NULL; if ((event->event_type == 0 && event->event_size == 0) || - ((v + sizeof(struct tcpa_event) + event->event_size) >= - tcg_eventlog_addr_limit)) + ((v + sizeof(struct tcpa_event) + event->event_size) >= limit)) return NULL; (*pos)++; @@ -312,10 +313,14 @@ static int tpm_binary_bios_measurements_show(struct seq_file *m, void *v) static int tpm_bios_measurements_release(struct inode *inode, struct file *file) { - if (tcg_eventlog) { - kfree(tcg_eventlog); - tcg_eventlog = NULL; + struct seq_file *seq = file->private_data; + struct tpm_bios_log *log = seq->private; + + if (log) { + kfree(log->bios_event_log); + kfree(log); } + return seq_release(inode, file); } @@ -367,13 +372,13 @@ static struct seq_operations tpm_binary_b_measurments_seqops = { }; /* read binary bios log */ -static int read_log(void) +static int read_log(struct tpm_bios_log *log) { struct acpi_tcpa *buff; acpi_status status; void *virt; - if (tcg_eventlog != NULL) { + if (log->bios_event_log != NULL) { printk(KERN_ERR "%s: ERROR - Eventlog already initialized\n", __func__); @@ -393,25 +398,24 @@ static int read_log(void) } if (buff->log_max_len == 0) { - printk(KERN_ERR "%s: ERROR - TCPA log area empty\n", - __func__); + printk(KERN_ERR "%s: ERROR - TCPA log area empty\n", __func__); return -EIO; } /* malloc EventLog space */ - tcg_eventlog = kmalloc(buff->log_max_len, GFP_KERNEL); - if (!tcg_eventlog) { + log->bios_event_log = kmalloc(buff->log_max_len, GFP_KERNEL); + if (!log->bios_event_log) { printk ("%s: ERROR - Not enough Memory for BIOS measurements\n", __func__); return -ENOMEM; } - tcg_eventlog_addr_limit = tcg_eventlog + buff->log_max_len; + log->bios_event_log_end = log->bios_event_log + buff->log_max_len; acpi_os_map_memory(buff->log_start_addr, buff->log_max_len, &virt); - memcpy(tcg_eventlog, virt, buff->log_max_len); + memcpy(log->bios_event_log, virt, buff->log_max_len); 
acpi_os_unmap_memory(virt, buff->log_max_len); return 0; @@ -421,12 +425,26 @@ static int tpm_ascii_bios_measurements_open(struct inode *inode, struct file *file) { int err; + struct tpm_bios_log *log; + struct seq_file *seq; - if ((err = read_log())) + log = kzalloc(sizeof(struct tpm_bios_log), GFP_KERNEL); + if (!log) + return -ENOMEM; + + if ((err = read_log(log))) return err; /* now register seq file */ - return seq_open(file, &tpm_ascii_b_measurments_seqops); + err = seq_open(file, &tpm_ascii_b_measurments_seqops); + if (!err) { + seq = file->private_data; + seq->private = log; + } else { + kfree(log->bios_event_log); + kfree(log); + } + return err; } struct file_operations tpm_ascii_bios_measurements_ops = { @@ -440,12 +458,26 @@ static int tpm_binary_bios_measurements_open(struct inode *inode, struct file *file) { int err; + struct tpm_bios_log *log; + struct seq_file *seq; - if ((err = read_log())) + log = kzalloc(sizeof(struct tpm_bios_log), GFP_KERNEL); + if (!log) + return -ENOMEM; + + if ((err = read_log(log))) return err; /* now register seq file */ - return seq_open(file, &tpm_binary_b_measurments_seqops); + err = seq_open(file, &tpm_binary_b_measurments_seqops); + if (!err) { + seq = file->private_data; + seq->private = log; + } else { + kfree(log->bios_event_log); + kfree(log); + } + return err; } struct file_operations tpm_binary_bios_measurements_ops = { -- cgit v1.2.2 From fb86a35b9ded8a7e53a432cbf28df603cdd4849c Mon Sep 17 00:00:00 2001 From: Mike Miller Date: Sun, 8 Jan 2006 01:03:50 -0800 Subject: [PATCH] cciss: adds MSI and MSI-X support This creates a new function, cciss_interrupt_mode called from cciss_pci_init. This function determines what type of interrupt vector to use, i.e., MSI, MSI-X, or IO-APIC. One noticeable difference is changing the interrupt field of the controller struct to an array of 4 unsigned ints. The Smart Array HW is capable of generating 4 distinct interrupts depending on the transport method in use during operation. These are: #define DOORBELL_INT 0 Used to notify the contoller of configuration updates. We only use this feature when in polling mode. #define PERF_MODE_INT 0 Used when the controller is in Performant Mode. #define SIMPLE_MODE_INT 2 Used when the controller is in Simple Mode (current Linux implementation). #define MEMQ_INT_MODE 3 Not used. When using IO-APIC interrupts these 4 lines are OR'ed together so when any one fires an interrupt an is generated. In MSI or MSI-X mode this hardware OR'ing is ignored. We must register for our interrupt depending on what mode the controller is running. For Linux we use SIMPLE_MODE_INT exclusively at this time. Please consider this for inclusion. Signed-off-by: Mike Miller Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/cciss.c | 87 ++++++++++++++++++++++++++++++++++++++++------ drivers/block/cciss.h | 8 ++++- drivers/block/cciss_scsi.c | 2 +- 3 files changed, 84 insertions(+), 13 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index bdb9c2717d..46e8356a96 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1,6 +1,6 @@ /* * Disk Array driver for HP SA 5xxx and 6xxx Controllers - * Copyright 2000, 2005 Hewlett-Packard Development Company, L.P. + * Copyright 2000, 2006 Hewlett-Packard Development Company, L.P. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -47,12 +47,12 @@ #include #define CCISS_DRIVER_VERSION(maj,min,submin) ((maj<<16)|(min<<8)|(submin)) -#define DRIVER_NAME "HP CISS Driver (v 2.6.8)" -#define DRIVER_VERSION CCISS_DRIVER_VERSION(2,6,8) +#define DRIVER_NAME "HP CISS Driver (v 2.6.10)" +#define DRIVER_VERSION CCISS_DRIVER_VERSION(2,6,10) /* Embedded module documentation macros - see modules.h */ MODULE_AUTHOR("Hewlett-Packard Company"); -MODULE_DESCRIPTION("Driver for HP Controller SA5xxx SA6xxx version 2.6.8"); +MODULE_DESCRIPTION("Driver for HP Controller SA5xxx SA6xxx version 2.6.10"); MODULE_SUPPORTED_DEVICE("HP SA5i SA5i+ SA532 SA5300 SA5312 SA641 SA642 SA6400" " SA6i P600 P800 P400 P400i E200 E200i"); MODULE_LICENSE("GPL"); @@ -167,7 +167,7 @@ static void cciss_geometry_inquiry(int ctlr, int logvol, unsigned int block_size, InquiryData_struct *inq_buff, drive_info_struct *drv); static void cciss_getgeometry(int cntl_num); - +static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *, __u32); static void start_io( ctlr_info_t *h); static int sendcmd( __u8 cmd, int ctlr, void *buff, size_t size, unsigned int use_unit_num, unsigned int log_unit, __u8 page_code, @@ -284,7 +284,7 @@ static int cciss_proc_get_info(char *buffer, char **start, off_t offset, h->product_name, (unsigned long)h->board_id, h->firm_ver[0], h->firm_ver[1], h->firm_ver[2], h->firm_ver[3], - (unsigned int)h->intr, + (unsigned int)h->intr[SIMPLE_MODE_INT], h->num_luns, h->Qdepth, h->commands_outstanding, h->maxQsinceinit, h->max_outstanding, h->maxSG); @@ -2662,6 +2662,60 @@ static int find_PCI_BAR_index(struct pci_dev *pdev, return -1; } +/* If MSI/MSI-X is supported by the kernel we will try to enable it on + * controllers that are capable. If not, we use IO-APIC mode. 
+ */ + +static void __devinit cciss_interrupt_mode(ctlr_info_t *c, struct pci_dev *pdev, __u32 board_id) +{ +#ifdef CONFIG_PCI_MSI + int err; + struct msix_entry cciss_msix_entries[4] = {{0,0}, {0,1}, + {0,2}, {0,3}}; + + /* Some boards advertise MSI but don't really support it */ + if ((board_id == 0x40700E11) || + (board_id == 0x40800E11) || + (board_id == 0x40820E11) || + (board_id == 0x40830E11)) + goto default_int_mode; + + if (pci_find_capability(pdev, PCI_CAP_ID_MSIX)) { + err = pci_enable_msix(pdev, cciss_msix_entries, 4); + if (!err) { + c->intr[0] = cciss_msix_entries[0].vector; + c->intr[1] = cciss_msix_entries[1].vector; + c->intr[2] = cciss_msix_entries[2].vector; + c->intr[3] = cciss_msix_entries[3].vector; + c->msix_vector = 1; + return; + } + if (err > 0) { + printk(KERN_WARNING "cciss: only %d MSI-X vectors " + "available\n", err); + } else { + printk(KERN_WARNING "cciss: MSI-X init failed %d\n", + err); + } + } + if (pci_find_capability(pdev, PCI_CAP_ID_MSI)) { + if (!pci_enable_msi(pdev)) { + c->intr[SIMPLE_MODE_INT] = pdev->irq; + c->msi_vector = 1; + return; + } else { + printk(KERN_WARNING "cciss: MSI init failed\n"); + c->intr[SIMPLE_MODE_INT] = pdev->irq; + return; + } + } +#endif /* CONFIG_PCI_MSI */ + /* if we get here we're going to use the default interrupt mode */ +default_int_mode: + c->intr[SIMPLE_MODE_INT] = pdev->irq; + return; +} + static int cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev) { ushort subsystem_vendor_id, subsystem_device_id, command; @@ -2722,7 +2776,10 @@ static int cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev) printk("board_id = %x\n", board_id); #endif /* CCISS_DEBUG */ - c->intr = pdev->irq; +/* If the kernel supports MSI/MSI-X we will try to enable that functionality, + * else we use the IO-APIC interrupt assigned to us by system ROM. 
+ */ + cciss_interrupt_mode(c, pdev, board_id); /* * Memory base addr is first addr , the second points to the config @@ -3076,11 +3133,11 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, /* make sure the board interrupts are off */ hba[i]->access.set_intr_mask(hba[i], CCISS_INTR_OFF); - if( request_irq(hba[i]->intr, do_cciss_intr, + if( request_irq(hba[i]->intr[SIMPLE_MODE_INT], do_cciss_intr, SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM, hba[i]->devname, hba[i])) { printk(KERN_ERR "cciss: Unable to get irq %d for %s\n", - hba[i]->intr, hba[i]->devname); + hba[i]->intr[SIMPLE_MODE_INT], hba[i]->devname); goto clean2; } hba[i]->cmd_pool_bits = kmalloc(((NR_CMDS+BITS_PER_LONG-1)/BITS_PER_LONG)*sizeof(unsigned long), GFP_KERNEL); @@ -3186,7 +3243,7 @@ clean4: NR_CMDS * sizeof( ErrorInfo_struct), hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle); - free_irq(hba[i]->intr, hba[i]); + free_irq(hba[i]->intr[SIMPLE_MODE_INT], hba[i]); clean2: unregister_blkdev(hba[i]->major, hba[i]->devname); clean1: @@ -3227,7 +3284,15 @@ static void __devexit cciss_remove_one (struct pci_dev *pdev) printk(KERN_WARNING "Error Flushing cache on controller %d\n", i); } - free_irq(hba[i]->intr, hba[i]); + free_irq(hba[i]->intr[2], hba[i]); + +#ifdef CONFIG_PCI_MSI + if (hba[i]->msix_vector) + pci_disable_msix(hba[i]->pdev); + else if (hba[i]->msi_vector) + pci_disable_msi(hba[i]->pdev); +#endif /* CONFIG_PCI_MSI */ + pci_set_drvdata(pdev, NULL); iounmap(hba[i]->vaddr); cciss_unregister_scsi(i); /* unhook from SCSI subsystem */ diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index 3b0858c838..ad45e581a9 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -65,7 +65,6 @@ struct ctlr_info unsigned long io_mem_addr; unsigned long io_mem_length; CfgTable_struct __iomem *cfgtable; - unsigned int intr; int interrupts_enabled; int major; int max_commands; @@ -74,6 +73,13 @@ struct ctlr_info int num_luns; int highest_lun; int usage_count; /* number of opens all all minor devices */ +# define DOORBELL_INT 0 +# define PERF_MODE_INT 1 +# define SIMPLE_MODE_INT 2 +# define MEMQ_MODE_INT 3 + unsigned int intr[4]; + unsigned int msix_vector; + unsigned int msi_vector; // information about each logical volume drive_info_struct drv[CISS_MAX_LUN]; diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index 2942d32280..9e35de05d5 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -714,7 +714,7 @@ cciss_scsi_detect(int ctlr) ((struct cciss_scsi_adapter_data_t *) hba[ctlr]->scsi_ctlr)->scsi_host = (void *) sh; sh->hostdata[0] = (unsigned long) hba[ctlr]; - sh->irq = hba[ctlr]->intr; + sh->irq = hba[ctlr]->intr[SIMPLE_MODE_INT]; sh->unique_id = sh->irq; error = scsi_add_host(sh, &hba[ctlr]->pdev->dev); if (error) -- cgit v1.2.2 From 9a5d3023e626a0baf86ac6b892c983b3db13f22b Mon Sep 17 00:00:00 2001 From: Oren Laadan Date: Sun, 8 Jan 2006 01:03:51 -0800 Subject: [PATCH] fork: fix race in setting child's pgrp and tty In fork, child should recopy parent's pgrp/tty after it has tasklist_lock. Otherwise following a setpgid() on the parent, *after* copy_signal(), the child will own a stale pgrp (which may be reused); (eg. if copy_mm() sleeps a long while due to memory pressure). Similar issue for the tty. 
Signed-off-by: Oren Laadan Cc: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 7992ee759d..4bc0bd8ef1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -811,9 +811,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->it_prof_expires = cputime_zero; sig->it_prof_incr = cputime_zero; - sig->tty = current->signal->tty; - sig->pgrp = process_group(current); - sig->session = current->signal->session; sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = 0; @@ -1136,15 +1133,15 @@ static task_t *copy_process(unsigned long clone_flags, attach_pid(p, PIDTYPE_PID, p->pid); attach_pid(p, PIDTYPE_TGID, p->tgid); if (thread_group_leader(p)) { + p->signal->tty = current->signal->tty; + p->signal->pgrp = process_group(current); + p->signal->session = current->signal->session; attach_pid(p, PIDTYPE_PGID, process_group(p)); attach_pid(p, PIDTYPE_SID, p->signal->session); if (p->pid) __get_cpu_var(process_counts)++; } - if (!current->signal->tty && p->signal->tty) - p->signal->tty = NULL; - nr_threads++; total_forks++; write_unlock_irq(&tasklist_lock); -- cgit v1.2.2 From ee0acf90d320c29916ba8c5c1b2e908d81f5057d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:03:53 -0800 Subject: [PATCH] setpgid: should work for sub-threads setpgid(0, pgid) or setpgid(forked_child_pid, pgid) does not work unless the calling process is a thread_group_leader(). 'man setpgid' does not tell anything about that, so I consider this behaviour is a bug. Signed-off-by: Oleg Nesterov Cc: Oren Laadan Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index d8e49e6590..43e557211d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1090,10 +1090,11 @@ asmlinkage long sys_times(struct tms __user * tbuf) asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) { struct task_struct *p; + struct task_struct *group_leader = current->group_leader; int err = -EINVAL; if (!pid) - pid = current->pid; + pid = group_leader->pid; if (!pgid) pgid = pid; if (pgid < 0) @@ -1113,16 +1114,16 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (!thread_group_leader(p)) goto out; - if (p->parent == current || p->real_parent == current) { + if (p->parent == current || p->real_parent == group_leader) { err = -EPERM; - if (p->signal->session != current->signal->session) + if (p->signal->session != group_leader->signal->session) goto out; err = -EACCES; if (p->did_exec) goto out; } else { err = -ESRCH; - if (p != current) + if (p != group_leader) goto out; } @@ -1134,7 +1135,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) struct task_struct *p; do_each_task_pid(pgid, PIDTYPE_PGID, p) { - if (p->signal->session == current->signal->session) + if (p->signal->session == group_leader->signal->session) goto ok_pgid; } while_each_task_pid(pgid, PIDTYPE_PGID, p); goto out; -- cgit v1.2.2 From e19f247a3dbd85485ec13174817ae9c2478fe541 Mon Sep 17 00:00:00 2001 From: Oren Laadan Date: Sun, 8 Jan 2006 01:03:58 -0800 Subject: [PATCH] setpgid: should work for sub-threads setsid() does not work unless the calling process is a thread_group_leader(). 'man setpgid' does not tell anything about that, so I consider this behaviour is a bug. 
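A hedged user-space sketch of the restriction being lifted (not part of the patch; build with -pthread): a thread other than the thread group leader calls setsid(). With the old check shown in the diff below this returned EINVAL; with the change it acts on the group leader and succeeds in the common case. The initial fork() only ensures the caller is not already a process group leader, which would make setsid() fail with EPERM for unrelated reasons.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static void *sub_thread(void *unused)
{
	/* setsid() issued from a thread that is not the group leader */
	(void)unused;
	if (setsid() < 0)
		printf("setsid from sub-thread failed: %s\n", strerror(errno));
	else
		printf("setsid from sub-thread succeeded, session %d\n",
		       (int)getsid(0));
	return NULL;
}

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		pthread_t t;

		pthread_create(&t, NULL, sub_thread, NULL);
		pthread_join(t, NULL);
		_exit(0);
	}
	waitpid(child, NULL, 0);
	return 0;
}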
Signed-off-by: Oren Laadan Cc: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- kernel/sys.c | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index a80824f610..caceabf3f2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -257,7 +257,7 @@ static inline void reparent_to_init(void) void __set_special_pids(pid_t session, pid_t pgrp) { - struct task_struct *curr = current; + struct task_struct *curr = current->group_leader; if (curr->signal->session != session) { detach_pid(curr, PIDTYPE_SID); diff --git a/kernel/sys.c b/kernel/sys.c index 43e557211d..f497bf56ad 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1215,24 +1215,22 @@ asmlinkage long sys_getsid(pid_t pid) asmlinkage long sys_setsid(void) { + struct task_struct *group_leader = current->group_leader; struct pid *pid; int err = -EPERM; - if (!thread_group_leader(current)) - return -EINVAL; - down(&tty_sem); write_lock_irq(&tasklist_lock); - pid = find_pid(PIDTYPE_PGID, current->pid); + pid = find_pid(PIDTYPE_PGID, group_leader->pid); if (pid) goto out; - current->signal->leader = 1; - __set_special_pids(current->pid, current->pid); - current->signal->tty = NULL; - current->signal->tty_old_pgrp = 0; - err = process_group(current); + group_leader->signal->leader = 1; + __set_special_pids(group_leader->pid, group_leader->pid); + group_leader->signal->tty = NULL; + group_leader->signal->tty_old_pgrp = 0; + err = process_group(group_leader); out: write_unlock_irq(&tasklist_lock); up(&tty_sem); -- cgit v1.2.2 From f7dd795e913656c390b6bde27790c518973feea1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:03:59 -0800 Subject: [PATCH] setpgid: should not accept ptraced childs sys_setpgid() allows to change ->pgrp of ptraced childs. 'man setpgid' does not tell anything about that, so I consider this behaviour is a bug. Signed-off-by: Oleg Nesterov Cc: Oren Laadan Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sys.c b/kernel/sys.c index f497bf56ad..218937e837 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1114,7 +1114,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (!thread_group_leader(p)) goto out; - if (p->parent == current || p->real_parent == group_leader) { + if (p->real_parent == group_leader) { err = -EPERM; if (p->signal->session != group_leader->signal->session) goto out; -- cgit v1.2.2 From 37a327957e4b193369854f76afcedaee8d903521 Mon Sep 17 00:00:00 2001 From: Andy Isaacson Date: Sun, 8 Jan 2006 01:04:00 -0800 Subject: [PATCH] block/stat.txt I couldn't find any docs explaining the contents of /sys/block//stat, so I wrote up the following. I'm not completely sure it's accurate - Jens, could you give a yea or nay on this? In particular, the counts of read/write IOs and read/write sectors are incremented in different places - it looks like they both increment as the request is being finished, but I'm not completely sure of that. 
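Since the file being documented is a single line of eleven whitespace-separated counters (the table follows in the diff below), a small reader makes the layout concrete. This is a hedged user-space sketch, not part of the patch; "sda" is only an example device name, and the labels simply repeat the order given in the added stat.txt.

#include <stdio.h>

int main(void)
{
	static const char *label[11] = {
		"read I/Os", "read merges", "read sectors", "read ticks",
		"write I/Os", "write merges", "write sectors", "write ticks",
		"in_flight", "io_ticks", "time_in_queue"
	};
	unsigned long long val[11];
	FILE *f = fopen("/sys/block/sda/stat", "r");
	int i, n;

	if (!f) {
		perror("fopen");
		return 1;
	}
	for (n = 0; n < 11; n++)
		if (fscanf(f, "%llu", &val[n]) != 1)
			break;
	fclose(f);
	for (i = 0; i < n; i++)
		printf("%-14s %llu\n", label[i], val[i]);
	return 0;
}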
Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/block/stat.txt | 82 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 Documentation/block/stat.txt diff --git a/Documentation/block/stat.txt b/Documentation/block/stat.txt new file mode 100644 index 0000000000..0dbc946de2 --- /dev/null +++ b/Documentation/block/stat.txt @@ -0,0 +1,82 @@ +Block layer statistics in /sys/block//stat +=============================================== + +This file documents the contents of the /sys/block//stat file. + +The stat file provides several statistics about the state of block +device . + +Q. Why are there multiple statistics in a single file? Doesn't sysfs + normally contain a single value per file? +A. By having a single file, the kernel can guarantee that the statistics + represent a consistent snapshot of the state of the device. If the + statistics were exported as multiple files containing one statistic + each, it would be impossible to guarantee that a set of readings + represent a single point in time. + +The stat file consists of a single line of text containing 11 decimal +values separated by whitespace. The fields are summarized in the +following table, and described in more detail below. + +Name units description +---- ----- ----------- +read I/Os requests number of read I/Os processed +read merges requests number of read I/Os merged with in-queue I/O +read sectors sectors number of sectors read +read ticks milliseconds total wait time for read requests +write I/Os requests number of write I/Os processed +write merges requests number of write I/Os merged with in-queue I/O +write sectors sectors number of sectors written +write ticks milliseconds total wait time for write requests +in_flight requests number of I/Os currently in flight +io_ticks milliseconds total time this block device has been active +time_in_queue milliseconds total wait time for all requests + +read I/Os, write I/Os +===================== + +These values increment when an I/O request completes. + +read merges, write merges +========================= + +These values increment when an I/O request is merged with an +already-queued I/O request. + +read sectors, write sectors +=========================== + +These values count the number of sectors read from or written to this +block device. The "sectors" in question are the standard UNIX 512-byte +sectors, not any device- or filesystem-specific block size. The +counters are incremented when the I/O completes. + +read ticks, write ticks +======================= + +These values count the number of milliseconds that I/O requests have +waited on this block device. If there are multiple I/O requests waiting, +these values will increase at a rate greater than 1000/second; for +example, if 60 read requests wait for an average of 30 ms, the read_ticks +field will increase by 60*30 = 1800. + +in_flight +========= + +This value counts the number of I/O requests that have been issued to +the device driver but have not yet completed. It does not include I/O +requests that are in the queue but not yet issued to the device driver. + +io_ticks +======== + +This value counts the number of milliseconds during which the device has +had I/O requests queued. + +time_in_queue +============= + +This value counts the number of milliseconds that I/O requests have waited +on this block device. 
If there are multiple I/O requests waiting, this +value will increase as the product of the number of milliseconds times the +number of requests waiting (see "read ticks" above for an example). -- cgit v1.2.2 From ddc0f846aa7621940b74cee0c91cd26405058a4d Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 8 Jan 2006 01:04:01 -0800 Subject: [PATCH] fs/udf/balloc.c: "extern inline" -> "static inline" "extern inline" doesn't make much sense. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/udf/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 6598a5037a..4fae57d9d1 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -41,7 +41,7 @@ #define uint(x) xuint(x) #define xuint(x) __le ## x -extern inline int find_next_one_bit (void * addr, int size, int offset) +static inline int find_next_one_bit (void * addr, int size, int offset) { uintBPL_t * p = ((uintBPL_t *) addr) + (offset / BITS_PER_LONG); int result = offset & ~(BITS_PER_LONG-1); -- cgit v1.2.2 From fe7d37d1fbf8ffe78abd72728b24fb0c64f7af55 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:04:02 -0800 Subject: [PATCH] copy_process: error path cleanup This patch moves 'fork_out:' under 'bad_fork_free:', and removes now unneeded 'if (retval)' check. Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 4bc0bd8ef1..72e3252c67 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1146,11 +1146,6 @@ static task_t *copy_process(unsigned long clone_flags, total_forks++; write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - retval = 0; - -fork_out: - if (retval) - return ERR_PTR(retval); return p; bad_fork_cleanup_namespace: @@ -1191,7 +1186,8 @@ bad_fork_cleanup_count: free_uid(p->user); bad_fork_free: free_task(p); - goto fork_out; +fork_out: + return ERR_PTR(retval); } struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) -- cgit v1.2.2 From fd285bb54d8a3e99810090ae88cfe8ed77d1da25 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:04:07 -0800 Subject: [PATCH] Abandon gcc-2.95.x There's one scsi driver which doesn't compile due to weird __VA_ARGS__ tricks and the rather useful scsi/sd.c is currently getting an ICE. None of the new SAS code compiles, due to extensive use of anonymous unions. The V4L guys are very good at exploiting the gcc-2.95.x macro expansion bug (_why_ does each driver need to implement its own debug macros?) and various people keep on sneaking in anonymous unions, which are rather nice. Plus anonymous unions are rather useful. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc2.h | 29 ----------------------------- include/linux/compiler.h | 2 -- init/main.c | 7 +------ 3 files changed, 1 insertion(+), 37 deletions(-) delete mode 100644 include/linux/compiler-gcc2.h diff --git a/include/linux/compiler-gcc2.h b/include/linux/compiler-gcc2.h deleted file mode 100644 index ebed17660c..0000000000 --- a/include/linux/compiler-gcc2.h +++ /dev/null @@ -1,29 +0,0 @@ -/* Never include this file directly. Include instead. */ - -/* These definitions are for GCC v2.x. 
*/ - -/* Somewhere in the middle of the GCC 2.96 development cycle, we implemented - a mechanism by which the user can annotate likely branch directions and - expect the blocks to be reordered appropriately. Define __builtin_expect - to nothing for earlier compilers. */ -#include - -#if __GNUC_MINOR__ < 96 -# define __builtin_expect(x, expected_value) (x) -#endif - -#define __attribute_used__ __attribute__((__unused__)) - -/* - * The attribute `pure' is not implemented in GCC versions earlier - * than 2.96. - */ -#if __GNUC_MINOR__ >= 96 -# define __attribute_pure__ __attribute__((pure)) -# define __attribute_const__ __attribute__((__const__)) -#endif - -/* GCC 2.95.x/2.96 recognize __va_copy, but not va_copy. Actually later GCC's - * define both va_copy and __va_copy, but the latter may go away, so limit this - * to this header */ -#define va_copy __va_copy diff --git a/include/linux/compiler.h b/include/linux/compiler.h index d7378215b8..f23d3c6fc2 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -42,8 +42,6 @@ extern void __chk_io_ptr(void __iomem *); # include #elif __GNUC__ == 3 # include -#elif __GNUC__ == 2 -# include #else # error Sorry, your compiler is too old/not recognized. #endif diff --git a/init/main.c b/init/main.c index afe5eb84ad..8342c2890b 100644 --- a/init/main.c +++ b/init/main.c @@ -58,11 +58,6 @@ * This is one of the first .c files built. Error out early * if we have compiler trouble.. */ -#if __GNUC__ == 2 && __GNUC_MINOR__ == 96 -#ifdef CONFIG_FRAME_POINTER -#error This compiler cannot compile correctly with frame pointers enabled -#endif -#endif #ifdef CONFIG_X86_LOCAL_APIC #include @@ -74,7 +69,7 @@ * To avoid associated bogus bug reports, we flatly refuse to compile * with a gcc that is known to be too old from the very beginning. */ -#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 95) +#if (__GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 2) #error Sorry, your GCC is too old. It builds incorrect kernels. #endif -- cgit v1.2.2 From a1365647022eb05a5993f270a78e9bef3bf554eb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:04:09 -0800 Subject: [PATCH] remove gcc-2 checks Remove various things which were checking for gcc-1.x and gcc-2.x compilers. From: Adrian Bunk Some documentation updates and removes some code paths for gcc < 3.2. 
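For what the new floor means in practice, the same preprocessor version gate that the init/main.c hunk above now uses can be tried in a stand-alone file. A hedged illustration only; the 3.2 minimum and the wording are mirrored from that hunk, not an additional requirement.

#include <stdio.h>

/* refuse to build with a compiler older than the chosen minimum,
 * the same idiom init/main.c uses to enforce gcc >= 3.2 */
#if !defined(__GNUC__) || (__GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 2)
#error Sorry, this example wants gcc 3.2 or newer.
#endif

int main(void)
{
	printf("compiled with gcc %d.%d.%d\n",
	       __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
	return 0;
}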
Acked-by: Russell King Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/Changes | 31 +++++-------------------------- README | 7 ++----- arch/arm/kernel/asm-offsets.c | 9 ++------- arch/arm26/kernel/asm-offsets.c | 7 ------- arch/i386/Kconfig | 4 ---- arch/i386/Makefile | 5 +---- arch/i386/Makefile.cpu | 10 +++++----- arch/ia64/Makefile | 4 ---- arch/ia64/kernel/head.S | 2 +- arch/ia64/kernel/ia64_ksyms.c | 2 +- arch/ia64/oprofile/backtrace.c | 2 +- drivers/md/raid0.c | 6 ------ drivers/media/video/v4l2-common.c | 2 -- fs/ocfs2/cluster/masklog.h | 7 +++---- fs/xfs/xfs_log.h | 8 +------- include/asm-alpha/compiler.h | 2 -- include/asm-alpha/processor.h | 21 --------------------- include/asm-ia64/bug.h | 6 +----- include/asm-ia64/spinlock.h | 2 +- include/asm-sparc64/system.h | 4 ---- include/asm-um/rwsem.h | 4 ---- include/asm-v850/unistd.h | 18 ------------------ include/linux/byteorder/generic.h | 2 +- include/linux/byteorder/swab.h | 2 +- include/linux/byteorder/swabb.h | 2 +- include/linux/compiler-gcc.h | 9 +++++++++ include/linux/compiler-gcc3.h | 17 ----------------- include/linux/compiler-gcc4.h | 7 ------- include/linux/kernel.h | 2 -- include/linux/seccomp.h | 6 +----- include/linux/spinlock_types_up.h | 14 -------------- sound/isa/wavefront/wavefront_synth.c | 7 ------- 32 files changed, 37 insertions(+), 194 deletions(-) diff --git a/Documentation/Changes b/Documentation/Changes index 86b86399d6..fe5ae0f550 100644 --- a/Documentation/Changes +++ b/Documentation/Changes @@ -31,8 +31,6 @@ al espa Eine deutsche Version dieser Datei finden Sie unter . -Last updated: October 29th, 2002 - Chris Ricker (kaboom@gatech.edu or chris.ricker@genetics.utah.edu). Current Minimal Requirements @@ -48,7 +46,7 @@ necessary on all systems; obviously, if you don't have any ISDN hardware, for example, you probably needn't concern yourself with isdn4k-utils. -o Gnu C 2.95.3 # gcc --version +o Gnu C 3.2 # gcc --version o Gnu make 3.79.1 # make --version o binutils 2.12 # ld -v o util-linux 2.10o # fdformat --version @@ -74,26 +72,7 @@ GCC --- The gcc version requirements may vary depending on the type of CPU in your -computer. The next paragraph applies to users of x86 CPUs, but not -necessarily to users of other CPUs. Users of other CPUs should obtain -information about their gcc version requirements from another source. - -The recommended compiler for the kernel is gcc 2.95.x (x >= 3), and it -should be used when you need absolute stability. You may use gcc 3.0.x -instead if you wish, although it may cause problems. Later versions of gcc -have not received much testing for Linux kernel compilation, and there are -almost certainly bugs (mainly, but not exclusively, in the kernel) that -will need to be fixed in order to use these compilers. In any case, using -pgcc instead of plain gcc is just asking for trouble. - -The Red Hat gcc 2.96 compiler subtree can also be used to build this tree. -You should ensure you use gcc-2.96-74 or later. gcc-2.96-54 will not build -the kernel correctly. - -In addition, please pay attention to compiler optimization. Anything -greater than -O2 may not be wise. Similarly, if you choose to use gcc-2.95.x -or derivatives, be sure not to use -fstrict-aliasing (which, depending on -your version of gcc 2.95.x, may necessitate using -fno-strict-aliasing). +computer. 
Make ---- @@ -322,9 +301,9 @@ Getting updated software Kernel compilation ****************** -gcc 2.95.3 ----------- -o +gcc +--- +o Make ---- diff --git a/README b/README index 61c4f74292..cd5e2eb621 100644 --- a/README +++ b/README @@ -183,11 +183,8 @@ CONFIGURING the kernel: COMPILING the kernel: - - Make sure you have gcc 2.95.3 available. - gcc 2.91.66 (egcs-1.1.2), and gcc 2.7.2.3 are known to miscompile - some parts of the kernel, and are *no longer supported*. - Also remember to upgrade your binutils package (for as/ld/nm and company) - if necessary. For more information, refer to Documentation/Changes. + - Make sure you have at least gcc 3.2 available. + For more information, refer to Documentation/Changes. Please note that you can still run a.out user programs with this kernel. diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 04d3082a7b..0abbce8c70 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -23,20 +23,15 @@ #error Sorry, your compiler targets APCS-26 but this kernel requires APCS-32 #endif /* - * GCC 2.95.1, 2.95.2: ignores register clobber list in asm(). * GCC 3.0, 3.1: general bad code generation. * GCC 3.2.0: incorrect function argument offset calculation. * GCC 3.2.x: miscompiles NEW_AUX_ENT in fs/binfmt_elf.c * (http://gcc.gnu.org/PR8896) and incorrect structure * initialisation in fs/jffs2/erase.c */ -#if __GNUC__ < 2 || \ - (__GNUC__ == 2 && __GNUC_MINOR__ < 95) || \ - (__GNUC__ == 2 && __GNUC_MINOR__ == 95 && __GNUC_PATCHLEVEL__ != 0 && \ - __GNUC_PATCHLEVEL__ < 3) || \ - (__GNUC__ == 3 && __GNUC_MINOR__ < 3) +#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3) #error Your compiler is too buggy; it is known to miscompile kernels. -#error Known good compilers: 2.95.3, 2.95.4, 2.96, 3.3 +#error Known good compilers: 3.3 #endif /* Use marker if you need to separate the values later */ diff --git a/arch/arm26/kernel/asm-offsets.c b/arch/arm26/kernel/asm-offsets.c index 4ccacaef94..ac682d5fd0 100644 --- a/arch/arm26/kernel/asm-offsets.c +++ b/arch/arm26/kernel/asm-offsets.c @@ -25,13 +25,6 @@ #if defined(__APCS_32__) && defined(CONFIG_CPU_26) #error Sorry, your compiler targets APCS-32 but this kernel requires APCS-26 #endif -#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 95) -#error Sorry, your compiler is known to miscompile kernels. Only use gcc 2.95.3 and later. -#endif -#if __GNUC__ == 2 && __GNUC_MINOR__ == 95 -/* shame we can't detect the .1 or .2 releases */ -#warning GCC 2.95.2 and earlier miscompiles kernels. -#endif /* Use marker if you need to separate the values later */ diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 968fabd872..486449e9e7 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -630,10 +630,6 @@ config REGPARM and passes the first three arguments of a function call in registers. This will probably break binary only modules. - This feature is only enabled for gcc-3.0 and later - earlier compilers - generate incorrect output with certain kernel constructs when - -mregparm=3 is used. - config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS diff --git a/arch/i386/Makefile b/arch/i386/Makefile index d121ea1846..b84119f9cc 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -37,10 +37,7 @@ CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) # CPU-specific tuning. Anything which can be shared with UML should go here. 
include $(srctree)/arch/i386/Makefile.cpu -# -mregparm=3 works ok on gcc-3.0 and later -# -GCC_VERSION := $(call cc-version) -cflags-$(CONFIG_REGPARM) += $(shell if [ $(GCC_VERSION) -ge 0300 ] ; then echo "-mregparm=3"; fi ;) +cflags-$(CONFIG_REGPARM) += -mregparm=3 # Disable unit-at-a-time mode, it makes gcc use a lot more stack # due to the lack of sharing of stacklots. diff --git a/arch/i386/Makefile.cpu b/arch/i386/Makefile.cpu index 8e51456df2..dcd936ef45 100644 --- a/arch/i386/Makefile.cpu +++ b/arch/i386/Makefile.cpu @@ -1,7 +1,7 @@ # CPU tuning section - shared with UML. # Must change only cflags-y (or [yn]), not CFLAGS! That makes a difference for UML. -#-mtune exists since gcc 3.4, and some -mcpu flavors didn't exist in gcc 2.95. +#-mtune exists since gcc 3.4 HAS_MTUNE := $(call cc-option-yn, -mtune=i386) ifeq ($(HAS_MTUNE),y) tune = $(call cc-option,-mtune=$(1),) @@ -14,7 +14,7 @@ cflags-$(CONFIG_M386) += -march=i386 cflags-$(CONFIG_M486) += -march=i486 cflags-$(CONFIG_M586) += -march=i586 cflags-$(CONFIG_M586TSC) += -march=i586 -cflags-$(CONFIG_M586MMX) += $(call cc-option,-march=pentium-mmx,-march=i586) +cflags-$(CONFIG_M586MMX) += -march=pentium-mmx cflags-$(CONFIG_M686) += -march=i686 cflags-$(CONFIG_MPENTIUMII) += -march=i686 $(call tune,pentium2) cflags-$(CONFIG_MPENTIUMIII) += -march=i686 $(call tune,pentium3) @@ -23,8 +23,8 @@ cflags-$(CONFIG_MPENTIUM4) += -march=i686 $(call tune,pentium4) cflags-$(CONFIG_MK6) += -march=k6 # Please note, that patches that add -march=athlon-xp and friends are pointless. # They make zero difference whatsosever to performance at this time. -cflags-$(CONFIG_MK7) += $(call cc-option,-march=athlon,-march=i686 $(align)-functions=4) -cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,$(call cc-option,-march=athlon,-march=i686 $(align)-functions=4)) +cflags-$(CONFIG_MK7) += -march=athlon +cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) @@ -37,5 +37,5 @@ cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_X86_ELAN) += -march=i486 # Geode GX1 support -cflags-$(CONFIG_MGEODEGX1) += $(call cc-option,-march=pentium-mmx,-march=i486) +cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index 67932ad530..57b047c27e 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -37,10 +37,6 @@ $(error Sorry, you need a newer version of the assember, one that is built from ftp://ftp.hpl.hp.com/pub/linux-ia64/gas-030124.tar.gz) endif -ifneq ($(shell if [ $(GCC_VERSION) -lt 0300 ] ; then echo "bad"; fi ;),) -$(error Sorry, your compiler is too old. GCC v2.96 is known to generate bad code.) -endif - ifeq ($(GCC_VERSION),0304) cflags-$(CONFIG_ITANIUM) += -mtune=merced cflags-$(CONFIG_MCKINLEY) += -mtune=mckinley diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S index bfe65b2e86..fbc7ea35dd 100644 --- a/arch/ia64/kernel/head.S +++ b/arch/ia64/kernel/head.S @@ -1060,7 +1060,7 @@ SET_REG(b5); * the clobber lists for spin_lock() in include/asm-ia64/spinlock.h. 
*/ -#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3) +#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3) GLOBAL_ENTRY(ia64_spinlock_contention_pre3_4) .prologue diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 5db9d3bcbb..e72de580eb 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -103,7 +103,7 @@ EXPORT_SYMBOL(unw_init_running); #ifdef ASM_SUPPORTED # ifdef CONFIG_SMP -# if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3) +# if (__GNUC__ == 3 && __GNUC_MINOR__ < 3) /* * This is not a normal routine and we don't want a function descriptor for it, so we use * a fake declaration here. diff --git a/arch/ia64/oprofile/backtrace.c b/arch/ia64/oprofile/backtrace.c index b7dabbfb0d..adb01566bd 100644 --- a/arch/ia64/oprofile/backtrace.c +++ b/arch/ia64/oprofile/backtrace.c @@ -32,7 +32,7 @@ typedef struct u64 *prev_pfs_loc; /* state for WAR for old spinlock ool code */ } ia64_backtrace_t; -#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3) +#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3) /* * Returns non-zero if the PC is in the spinlock contention out-of-line code * with non-standard calling sequence (on older compilers). diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index abbca15020..d03f99cf4b 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -306,9 +306,6 @@ static int raid0_run (mddev_t *mddev) printk("raid0 : conf->hash_spacing is %llu blocks.\n", (unsigned long long)conf->hash_spacing); { -#if __GNUC__ < 3 - volatile -#endif sector_t s = mddev->array_size; sector_t space = conf->hash_spacing; int round; @@ -439,9 +436,6 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio) { -#if __GNUC__ < 3 - volatile -#endif sector_t x = block >> conf->preshift; sector_div(x, (u32)conf->hash_spacing); zone = conf->hash_table[x]; diff --git a/drivers/media/video/v4l2-common.c b/drivers/media/video/v4l2-common.c index 597b8db35a..62a7d636ef 100644 --- a/drivers/media/video/v4l2-common.c +++ b/drivers/media/video/v4l2-common.c @@ -191,9 +191,7 @@ char *v4l2_type_names[] = { }; char *v4l2_ioctl_names[256] = { -#if __GNUC__ >= 3 [0 ... 255] = "UNKNOWN", -#endif [_IOC_NR(VIDIOC_QUERYCAP)] = "VIDIOC_QUERYCAP", [_IOC_NR(VIDIOC_RESERVED)] = "VIDIOC_RESERVED", [_IOC_NR(VIDIOC_ENUM_FMT)] = "VIDIOC_ENUM_FMT", diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index f5ef5ea61a..e8c56a3d9c 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -212,11 +212,10 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; mlog(ML_ENTRY, "ENTRY:\n"); \ } while (0) -/* We disable this for old compilers since they don't have support for - * __builtin_types_compatible_p. +/* + * We disable this for sparse. 
*/ -#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) && \ - !defined(__CHECKER__) +#if !defined(__CHECKER__) #define mlog_exit(st) do { \ if (__builtin_types_compatible_p(typeof(st), unsigned long)) \ mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st)); \ diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 158829ca56..f40d4391fc 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -30,13 +30,7 @@ * By comparing each compnent, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number */ -static -#if defined(__GNUC__) && (__GNUC__ == 2) && ( (__GNUC_MINOR__ == 95) || (__GNUC_MINOR__ == 96)) -__attribute__((unused)) /* gcc 2.95, 2.96 miscompile this when inlined */ -#else -__inline__ -#endif -xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) +static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) { if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2)) return (CYCLE_LSN(lsn1)= 1 || __GNUC__ > 3 #undef __always_inline #define __always_inline inline __attribute__((always_inline)) -#endif #endif /* __ALPHA_COMPILER_H */ diff --git a/include/asm-alpha/processor.h b/include/asm-alpha/processor.h index 059780a7d3..bb1a7a3abb 100644 --- a/include/asm-alpha/processor.h +++ b/include/asm-alpha/processor.h @@ -77,7 +77,6 @@ unsigned long get_wchan(struct task_struct *p); #define spin_lock_prefetch(lock) do { } while (0) #endif -#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1) extern inline void prefetch(const void *ptr) { __builtin_prefetch(ptr, 0, 3); @@ -95,24 +94,4 @@ extern inline void spin_lock_prefetch(const void *ptr) } #endif -#else -extern inline void prefetch(const void *ptr) -{ - __asm__ ("ldl $31,%0" : : "m"(*(char *)ptr)); -} - -extern inline void prefetchw(const void *ptr) -{ - __asm__ ("ldq $31,%0" : : "m"(*(char *)ptr)); -} - -#ifdef CONFIG_SMP -extern inline void spin_lock_prefetch(const void *ptr) -{ - __asm__ ("ldq $31,%0" : : "m"(*(char *)ptr)); -} -#endif - -#endif /* GCC 3.1 */ - #endif /* __ASM_ALPHA_PROCESSOR_H */ diff --git a/include/asm-ia64/bug.h b/include/asm-ia64/bug.h index 3aa0a0a547..823616b502 100644 --- a/include/asm-ia64/bug.h +++ b/include/asm-ia64/bug.h @@ -2,11 +2,7 @@ #define _ASM_IA64_BUG_H #ifdef CONFIG_BUG -#if (__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1) -# define ia64_abort() __builtin_trap() -#else -# define ia64_abort() (*(volatile int *) 0 = 0) -#endif +#define ia64_abort() __builtin_trap() #define BUG() do { printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); ia64_abort(); } while (0) /* should this BUG be made generic? */ diff --git a/include/asm-ia64/spinlock.h b/include/asm-ia64/spinlock.h index 0c91a76c5e..9e83210dc3 100644 --- a/include/asm-ia64/spinlock.h +++ b/include/asm-ia64/spinlock.h @@ -34,7 +34,7 @@ __raw_spin_lock_flags (raw_spinlock_t *lock, unsigned long flags) { register volatile unsigned int *ptr asm ("r31") = &lock->lock; -#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3) +#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3) # ifdef CONFIG_ITANIUM /* don't use brl on Itanium... */ asm volatile ("{\n\t" diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h index b5417529f6..309f1466b6 100644 --- a/include/asm-sparc64/system.h +++ b/include/asm-sparc64/system.h @@ -193,11 +193,7 @@ do { \ * not preserve it's value. Hairy, but it lets us remove 2 loads * and 2 stores in this critical code path. 
-DaveM */ -#if __GNUC__ >= 3 #define EXTRA_CLOBBER ,"%l1" -#else -#define EXTRA_CLOBBER -#endif #define switch_to(prev, next, last) \ do { if (test_thread_flag(TIF_PERFCTR)) { \ unsigned long __tmp; \ diff --git a/include/asm-um/rwsem.h b/include/asm-um/rwsem.h index 661c0e5470..b5fc449dc8 100644 --- a/include/asm-um/rwsem.h +++ b/include/asm-um/rwsem.h @@ -1,10 +1,6 @@ #ifndef __UM_RWSEM_H__ #define __UM_RWSEM_H__ -#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96) -#define __builtin_expect(exp,c) (exp) -#endif - #include "asm/arch/rwsem.h" #endif diff --git a/include/asm-v850/unistd.h b/include/asm-v850/unistd.h index 5a86f8e976..82460a7bb2 100644 --- a/include/asm-v850/unistd.h +++ b/include/asm-v850/unistd.h @@ -241,9 +241,6 @@ /* User programs sometimes end up including this header file (indirectly, via uClibc header files), so I'm a bit nervous just including . */ -#if !defined(__builtin_expect) && __GNUC__ == 2 && __GNUC_MINOR__ < 96 -#define __builtin_expect(x, expected_value) (x) -#endif #define __syscall_return(type, res) \ do { \ @@ -346,20 +343,6 @@ type name (atype a, btype b, ctype c, dtype d, etype e) \ __syscall_return (type, __ret); \ } -#if __GNUC__ < 3 -/* In older versions of gcc, `asm' statements with more than 10 - input/output arguments produce a fatal error. To work around this - problem, we use two versions, one for gcc-3.x and one for earlier - versions of gcc (the `earlier gcc' version doesn't work with gcc-3.x - because gcc-3.x doesn't allow clobbers to also be input arguments). */ -#define __SYSCALL6_TRAP(syscall, ret, a, b, c, d, e, f) \ - __asm__ __volatile__ ("trap " SYSCALL_LONG_TRAP \ - : "=r" (ret), "=r" (syscall) \ - : "1" (syscall), \ - "r" (a), "r" (b), "r" (c), "r" (d), \ - "r" (e), "r" (f) \ - : SYSCALL_CLOBBERS, SYSCALL_ARG4, SYSCALL_ARG5); -#else /* __GNUC__ >= 3 */ #define __SYSCALL6_TRAP(syscall, ret, a, b, c, d, e, f) \ __asm__ __volatile__ ("trap " SYSCALL_LONG_TRAP \ : "=r" (ret), "=r" (syscall), \ @@ -368,7 +351,6 @@ type name (atype a, btype b, ctype c, dtype d, etype e) \ "r" (a), "r" (b), "r" (c), "r" (d), \ "2" (e), "3" (f) \ : SYSCALL_CLOBBERS); -#endif #define _syscall6(type, name, atype, a, btype, b, ctype, c, dtype, d, etype, e, ftype, f) \ type name (atype a, btype b, ctype c, dtype d, etype e, ftype f) \ diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index 04bd756efc..e86e4a9383 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -156,7 +156,7 @@ extern __be32 htonl(__u32); extern __u16 ntohs(__be16); extern __be16 htons(__u16); -#if defined(__GNUC__) && (__GNUC__ >= 2) && defined(__OPTIMIZE__) +#if defined(__GNUC__) && defined(__OPTIMIZE__) #define ___htonl(x) __cpu_to_be32(x) #define ___htons(x) __cpu_to_be16(x) diff --git a/include/linux/byteorder/swab.h b/include/linux/byteorder/swab.h index 2f1cb77512..25f7f32883 100644 --- a/include/linux/byteorder/swab.h +++ b/include/linux/byteorder/swab.h @@ -110,7 +110,7 @@ /* * Allow constant folding */ -#if defined(__GNUC__) && (__GNUC__ >= 2) && defined(__OPTIMIZE__) +#if defined(__GNUC__) && defined(__OPTIMIZE__) # define __swab16(x) \ (__builtin_constant_p((__u16)(x)) ? 
\ ___swab16((x)) : \ diff --git a/include/linux/byteorder/swabb.h b/include/linux/byteorder/swabb.h index d5f2a32051..ae5e5f914b 100644 --- a/include/linux/byteorder/swabb.h +++ b/include/linux/byteorder/swabb.h @@ -77,7 +77,7 @@ /* * Allow constant folding */ -#if defined(__GNUC__) && (__GNUC__ >= 2) && defined(__OPTIMIZE__) +#if defined(__GNUC__) && defined(__OPTIMIZE__) # define __swahw32(x) \ (__builtin_constant_p((__u32)(x)) ? \ ___swahw32((x)) : \ diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 1527340554..2e05e1e6b0 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -15,3 +15,12 @@ ({ unsigned long __ptr; \ __asm__ ("" : "=g"(__ptr) : "0"(ptr)); \ (typeof(ptr)) (__ptr + (off)); }) + + +#define inline inline __attribute__((always_inline)) +#define __inline__ __inline__ __attribute__((always_inline)) +#define __inline __inline __attribute__((always_inline)) +#define __deprecated __attribute__((deprecated)) +#define noinline __attribute__((noinline)) +#define __attribute_pure__ __attribute__((pure)) +#define __attribute_const__ __attribute__((__const__)) diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h index a6fa615afa..4209082ee9 100644 --- a/include/linux/compiler-gcc3.h +++ b/include/linux/compiler-gcc3.h @@ -3,29 +3,12 @@ /* These definitions are for GCC v3.x. */ #include -#if __GNUC_MINOR__ >= 1 -# define inline inline __attribute__((always_inline)) -# define __inline__ __inline__ __attribute__((always_inline)) -# define __inline __inline __attribute__((always_inline)) -#endif - -#if __GNUC_MINOR__ > 0 -# define __deprecated __attribute__((deprecated)) -#endif - #if __GNUC_MINOR__ >= 3 # define __attribute_used__ __attribute__((__used__)) #else # define __attribute_used__ __attribute__((__unused__)) #endif -#define __attribute_pure__ __attribute__((pure)) -#define __attribute_const__ __attribute__((__const__)) - -#if __GNUC_MINOR__ >= 1 -#define noinline __attribute__((noinline)) -#endif - #if __GNUC_MINOR__ >= 4 #define __must_check __attribute__((warn_unused_result)) #endif diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index 53686c037a..e913e9beaf 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -3,14 +3,7 @@ /* These definitions are for GCC v4.x. 
*/ #include -#define inline inline __attribute__((always_inline)) -#define __inline__ __inline__ __attribute__((always_inline)) -#define __inline __inline __attribute__((always_inline)) -#define __deprecated __attribute__((deprecated)) #define __attribute_used__ __attribute__((__used__)) -#define __attribute_pure__ __attribute__((pure)) -#define __attribute_const__ __attribute__((__const__)) -#define noinline __attribute__((noinline)) #define __must_check __attribute__((warn_unused_result)) #define __compiler_offsetof(a,b) __builtin_offsetof(a,b) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b1e407a4fb..ca7ff8fdd0 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -316,8 +316,6 @@ extern int randomize_va_space; #endif /* Trap pasters of __FUNCTION__ at compile-time */ -#if __GNUC__ > 2 || __GNUC_MINOR__ >= 95 #define __FUNCTION__ (__func__) -#endif #endif diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index dc89116bb1..cd2773b29a 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -26,11 +26,7 @@ static inline int has_secure_computing(struct thread_info *ti) #else /* CONFIG_SECCOMP */ -#if (__GNUC__ > 2) - typedef struct { } seccomp_t; -#else - typedef struct { int gcc_is_buggy; } seccomp_t; -#endif +typedef struct { } seccomp_t; #define secure_computing(x) do { } while (0) /* static inline to preserve typechecking */ diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h index def2d173a8..04135b0e19 100644 --- a/include/linux/spinlock_types_up.h +++ b/include/linux/spinlock_types_up.h @@ -22,30 +22,16 @@ typedef struct { #else -/* - * All gcc 2.95 versions and early versions of 2.96 have a nasty bug - * with empty initializers. - */ -#if (__GNUC__ > 2) typedef struct { } raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { } -#else -typedef struct { int gcc_is_buggy; } raw_spinlock_t; -#define __RAW_SPIN_LOCK_UNLOCKED (raw_spinlock_t) { 0 } -#endif #endif -#if (__GNUC__ > 2) typedef struct { /* no debug version on UP */ } raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { } -#else -typedef struct { int gcc_is_buggy; } raw_rwlock_t; -#define __RAW_RW_LOCK_UNLOCKED (raw_rwlock_t) { 0 } -#endif #endif /* __LINUX_SPINLOCK_TYPES_UP_H */ diff --git a/sound/isa/wavefront/wavefront_synth.c b/sound/isa/wavefront/wavefront_synth.c index 679d0ae97e..ed81eec6e7 100644 --- a/sound/isa/wavefront/wavefront_synth.c +++ b/sound/isa/wavefront/wavefront_synth.c @@ -115,18 +115,11 @@ MODULE_PARM_DESC(osrun_time, "how many seconds to wait for the ICS2115 OS"); #ifdef WF_DEBUG -#if defined(NEW_MACRO_VARARGS) || __GNUC__ >= 3 #define DPRINT(cond, ...) \ if ((dev->debug & (cond)) == (cond)) { \ snd_printk (__VA_ARGS__); \ } #else -#define DPRINT(cond, args...) \ - if ((dev->debug & (cond)) == (cond)) { \ - snd_printk (args); \ - } -#endif -#else #define DPRINT(cond, args...) #endif /* WF_DEBUG */ -- cgit v1.2.2 From 44ac8413901167589226abf824d994aa57e4fd28 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Sun, 8 Jan 2006 01:04:10 -0800 Subject: [PATCH] /dev/mem __HAVE_PHYS_MEM_ACCESS_PROT tidy-up Tidy up __HAVE_PHYS_MEM_ACCESS_PROT usage to make mmap_mem() easier to read. 
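As a rough sketch of how the hook is meant to be consumed (a hypothetical architecture-side example assumed from the generic pattern, not code from this patch): a port that defines __HAVE_PHYS_MEM_ACCESS_PROT supplies its own phys_mem_access_prot() with the same signature, and the generic mmap_mem() calls it unconditionally:

  /*
   * Hypothetical architecture-side sketch; assumes kernel context
   * (struct file, pgprot_t, pgprot_noncached from the usual headers),
   * and max_pfn is used purely as an illustrative threshold.
   */
  #define __HAVE_PHYS_MEM_ACCESS_PROT
  pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                                unsigned long size, pgprot_t vma_prot)
  {
          if (pfn >= max_pfn)             /* e.g. MMIO above RAM: map uncached */
                  return pgprot_noncached(vma_prot);
          return vma_prot;
  }

Architectures without the define fall back to the static helper added below, which preserves the old pgprot_noncached behaviour.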
Signed-off-by: Bjorn Helgaas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/mem.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index a85f3a3614..ce3ff86411 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -228,20 +228,25 @@ static ssize_t write_mem(struct file * file, const char __user * buf, return written; } +#ifndef __HAVE_PHYS_MEM_ACCESS_PROT +static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) +{ +#ifdef pgprot_noncached + unsigned long offset = pfn << PAGE_SHIFT; + + if (uncached_access(file, offset)) + return pgprot_noncached(vma_prot); +#endif + return vma_prot; +} +#endif + static int mmap_mem(struct file * file, struct vm_area_struct * vma) { -#if defined(__HAVE_PHYS_MEM_ACCESS_PROT) vma->vm_page_prot = phys_mem_access_prot(file, vma->vm_pgoff, vma->vm_end - vma->vm_start, vma->vm_page_prot); -#elif defined(pgprot_noncached) - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; - int uncached; - - uncached = uncached_access(file, offset); - if (uncached) - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); -#endif /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ if (remap_pfn_range(vma, -- cgit v1.2.2 From 80851ef2a5a404e6054211ca96ecd5ac4b06d297 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Sun, 8 Jan 2006 01:04:13 -0800 Subject: [PATCH] /dev/mem: validate mmap requests Add a hook so architectures can validate /dev/mem mmap requests. This is analogous to validation we already perform in the read/write paths. The identity mapping scheme used on ia64 requires that each 16MB or 64MB granule be accessed with exactly one attribute (write-back or uncacheable). This avoids "attribute aliasing", which can cause a machine check. Sample problem scenario: - Machine supports VGA, so it has uncacheable (UC) MMIO at 640K-768K - efi_memmap_init() discards any write-back (WB) memory in the first granule - Application (e.g., "hwinfo") mmaps /dev/mem, offset 0 - hwinfo receives UC mapping (the default, since memmap says "no WB here") - Machine check abort (on chipsets that don't support UC access to WB memory, e.g., sx1000) In the scenario above, the only choices are - Use WB for hwinfo mmap. Can't do this because it causes attribute aliasing with the UC mapping for the VGA MMIO space. - Use UC for hwinfo mmap. Can't do this because the chipset may not support UC for that region. - Disallow the hwinfo mmap with -EINVAL. That's what this patch does. 
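From userspace the effect is straightforward (hypothetical test program, not part of the patch): an mmap of a /dev/mem region whose attributes cannot be honoured now fails cleanly with -EINVAL instead of risking a machine check on a later access.

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/mman.h>

  int main(void)
  {
          int fd = open("/dev/mem", O_RDONLY);
          if (fd < 0) {
                  perror("open");
                  return 1;
          }
          /* offset 0 is the case hwinfo trips over in the sx1000 example */
          void *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
          if (p == MAP_FAILED)
                  perror("mmap");         /* EINVAL when validation rejects it */
          return 0;
  }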
Signed-off-by: Bjorn Helgaas Cc: Hugh Dickins Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/efi.c | 160 ++++++++++++++++++++++++++++++++++--------------- drivers/char/mem.c | 14 ++++- include/asm-ia64/io.h | 1 + 3 files changed, 124 insertions(+), 51 deletions(-) diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index a3aa45cbcf..c485a3b32b 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -247,6 +247,32 @@ typedef struct kern_memdesc { static kern_memdesc_t *kern_memmap; +#define efi_md_size(md) (md->num_pages << EFI_PAGE_SHIFT) + +static inline u64 +kmd_end(kern_memdesc_t *kmd) +{ + return (kmd->start + (kmd->num_pages << EFI_PAGE_SHIFT)); +} + +static inline u64 +efi_md_end(efi_memory_desc_t *md) +{ + return (md->phys_addr + efi_md_size(md)); +} + +static inline int +efi_wb(efi_memory_desc_t *md) +{ + return (md->attribute & EFI_MEMORY_WB); +} + +static inline int +efi_uc(efi_memory_desc_t *md) +{ + return (md->attribute & EFI_MEMORY_UC); +} + static void walk (efi_freemem_callback_t callback, void *arg, u64 attr) { @@ -595,8 +621,8 @@ efi_get_iobase (void) return 0; } -u32 -efi_mem_type (unsigned long phys_addr) +static efi_memory_desc_t * +efi_memory_descriptor (unsigned long phys_addr) { void *efi_map_start, *efi_map_end, *p; efi_memory_desc_t *md; @@ -610,13 +636,13 @@ efi_mem_type (unsigned long phys_addr) md = p; if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) - return md->type; + return md; } return 0; } -u64 -efi_mem_attributes (unsigned long phys_addr) +static int +efi_memmap_has_mmio (void) { void *efi_map_start, *efi_map_end, *p; efi_memory_desc_t *md; @@ -629,36 +655,98 @@ efi_mem_attributes (unsigned long phys_addr) for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { md = p; - if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) - return md->attribute; + if (md->type == EFI_MEMORY_MAPPED_IO) + return 1; } return 0; } + +u32 +efi_mem_type (unsigned long phys_addr) +{ + efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); + + if (md) + return md->type; + return 0; +} + +u64 +efi_mem_attributes (unsigned long phys_addr) +{ + efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); + + if (md) + return md->attribute; + return 0; +} EXPORT_SYMBOL(efi_mem_attributes); +/* + * Determines whether the memory at phys_addr supports the desired + * attribute (WB, UC, etc). If this returns 1, the caller can safely + * access *size bytes at phys_addr with the specified attribute. + */ +static int +efi_mem_attribute_range (unsigned long phys_addr, unsigned long *size, u64 attr) +{ + efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); + unsigned long md_end; + + if (!md || (md->attribute & attr) != attr) + return 0; + + do { + md_end = efi_md_end(md); + if (phys_addr + *size <= md_end) + return 1; + + md = efi_memory_descriptor(md_end); + if (!md || (md->attribute & attr) != attr) { + *size = md_end - phys_addr; + return 1; + } + } while (md); + return 0; +} + +/* + * For /dev/mem, we only allow read & write system calls to access + * write-back memory, because read & write don't allow the user to + * control access size. 
+ */ int valid_phys_addr_range (unsigned long phys_addr, unsigned long *size) { - void *efi_map_start, *efi_map_end, *p; - efi_memory_desc_t *md; - u64 efi_desc_size; + return efi_mem_attribute_range(phys_addr, size, EFI_MEMORY_WB); +} - efi_map_start = __va(ia64_boot_param->efi_memmap); - efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; - efi_desc_size = ia64_boot_param->efi_memdesc_size; +/* + * We allow mmap of anything in the EFI memory map that supports + * either write-back or uncacheable access. For uncacheable regions, + * the supported access sizes are system-dependent, and the user is + * responsible for using the correct size. + * + * Note that this doesn't currently allow access to hot-added memory, + * because that doesn't appear in the boot-time EFI memory map. + */ +int +valid_mmap_phys_addr_range (unsigned long phys_addr, unsigned long *size) +{ + if (efi_mem_attribute_range(phys_addr, size, EFI_MEMORY_WB)) + return 1; - for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { - md = p; + if (efi_mem_attribute_range(phys_addr, size, EFI_MEMORY_UC)) + return 1; - if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) { - if (!(md->attribute & EFI_MEMORY_WB)) - return 0; + /* + * Some firmware doesn't report MMIO regions in the EFI memory map. + * The Intel BigSur (a.k.a. HP i2000) has this problem. In this + * case, we can't use the EFI memory map to validate mmap requests. + */ + if (!efi_memmap_has_mmio()) + return 1; - if (*size > md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr) - *size = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr; - return 1; - } - } return 0; } @@ -707,32 +795,6 @@ efi_uart_console_only(void) return 0; } -#define efi_md_size(md) (md->num_pages << EFI_PAGE_SHIFT) - -static inline u64 -kmd_end(kern_memdesc_t *kmd) -{ - return (kmd->start + (kmd->num_pages << EFI_PAGE_SHIFT)); -} - -static inline u64 -efi_md_end(efi_memory_desc_t *md) -{ - return (md->phys_addr + efi_md_size(md)); -} - -static inline int -efi_wb(efi_memory_desc_t *md) -{ - return (md->attribute & EFI_MEMORY_WB); -} - -static inline int -efi_uc(efi_memory_desc_t *md) -{ - return (md->attribute & EFI_MEMORY_UC); -} - /* * Look for the first granule aligned memory descriptor memory * that is big enough to hold EFI memory map. 
Make sure this diff --git a/drivers/char/mem.c b/drivers/char/mem.c index ce3ff86411..5b2d180350 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -101,6 +101,11 @@ static inline int valid_phys_addr_range(unsigned long addr, size_t *count) return 1; } + +static inline int valid_mmap_phys_addr_range(unsigned long addr, size_t *size) +{ + return 1; +} #endif /* @@ -244,15 +249,20 @@ static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, static int mmap_mem(struct file * file, struct vm_area_struct * vma) { + size_t size = vma->vm_end - vma->vm_start; + + if (!valid_mmap_phys_addr_range(vma->vm_pgoff << PAGE_SHIFT, &size)) + return -EINVAL; + vma->vm_page_prot = phys_mem_access_prot(file, vma->vm_pgoff, - vma->vm_end - vma->vm_start, + size, vma->vm_page_prot); /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end-vma->vm_start, + size, vma->vm_page_prot)) return -EAGAIN; return 0; diff --git a/include/asm-ia64/io.h b/include/asm-ia64/io.h index cf772a67f8..b64fdb9854 100644 --- a/include/asm-ia64/io.h +++ b/include/asm-ia64/io.h @@ -89,6 +89,7 @@ phys_to_virt (unsigned long address) #define ARCH_HAS_VALID_PHYS_ADDR_RANGE extern int valid_phys_addr_range (unsigned long addr, size_t *count); /* efi.c */ +extern int valid_mmap_phys_addr_range (unsigned long addr, size_t *count); /* * The following two macros are deprecated and scheduled for removal. -- cgit v1.2.2 From fee781e6c25772db862d3322b4745a896022a4f1 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 8 Jan 2006 01:04:16 -0800 Subject: [PATCH] fs/proc/: function prototypes belong in header files Function prototypes belong into header files. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 2 ++ fs/proc/inode.c | 2 +- fs/proc/internal.h | 4 ++++ fs/proc/root.c | 3 ++- 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 72b431d0a0..20e5c4509a 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -21,6 +21,8 @@ #include #include +#include "internal.h" + static ssize_t proc_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos); static ssize_t proc_file_write(struct file *file, const char __user *buffer, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index e6a818a93f..6573f31f1f 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -19,7 +19,7 @@ #include #include -extern void free_proc_entry(struct proc_dir_entry *); +#include "internal.h" static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 3e55198f98..95a1cf32b8 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -37,6 +37,10 @@ extern int proc_tgid_stat(struct task_struct *, char *); extern int proc_pid_status(struct task_struct *, char *); extern int proc_pid_statm(struct task_struct *, char *); +void free_proc_entry(struct proc_dir_entry *de); + +int proc_init_inodecache(void); + static inline struct task_struct *proc_task(struct inode *inode) { return PROC_I(inode)->task; diff --git a/fs/proc/root.c b/fs/proc/root.c index aef148f099..68896283c8 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -18,6 +18,8 @@ #include #include +#include "internal.h" + struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver; #ifdef CONFIG_SYSCTL @@ -36,7 +38,6 @@ static struct file_system_type proc_fs_type = { .kill_sb = 
kill_anon_super, }; -extern int __init proc_init_inodecache(void); void __init proc_root_init(void) { int err = proc_init_inodecache(); -- cgit v1.2.2 From 44f061033360f9d4db7e9b29d64f9df3667cb41e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 8 Jan 2006 01:04:22 -0800 Subject: [PATCH] Sonypi: convert to the new platform device interface Do not use platform_device_register_simple() as it is going away, implement ->probe() and -remove() functions so manual binding and unbinding will work with this driver. Signed-off-by: Dmitry Torokhov Cc: Stelian Pop Cc: Mattia Dongili Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/sonypi.c | 358 +++++++++++++++++++++++++++++--------------------- 1 file changed, 206 insertions(+), 152 deletions(-) diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c index 51a07370e6..6a9e23dc48 100644 --- a/drivers/char/sonypi.c +++ b/drivers/char/sonypi.c @@ -471,7 +471,6 @@ struct sonypi_keypress { static struct sonypi_device { struct pci_dev *dev; - struct platform_device *pdev; u16 irq; u16 bits; u16 ioport1; @@ -1165,45 +1164,12 @@ static int sonypi_disable(void) return 0; } -#ifdef CONFIG_PM -static int old_camera_power; - -static int sonypi_suspend(struct platform_device *dev, pm_message_t state) -{ - old_camera_power = sonypi_device.camera_power; - sonypi_disable(); - - return 0; -} - -static int sonypi_resume(struct platform_device *dev) -{ - sonypi_enable(old_camera_power); - return 0; -} -#endif - -static void sonypi_shutdown(struct platform_device *dev) -{ - sonypi_disable(); -} - -static struct platform_driver sonypi_driver = { -#ifdef CONFIG_PM - .suspend = sonypi_suspend, - .resume = sonypi_resume, -#endif - .shutdown = sonypi_shutdown, - .driver = { - .name = "sonypi", - }, -}; - static int __devinit sonypi_create_input_devices(void) { struct input_dev *jog_dev; struct input_dev *key_dev; int i; + int error; sonypi_device.input_jog_dev = jog_dev = input_allocate_device(); if (!jog_dev) @@ -1219,9 +1185,8 @@ static int __devinit sonypi_create_input_devices(void) sonypi_device.input_key_dev = key_dev = input_allocate_device(); if (!key_dev) { - input_free_device(jog_dev); - sonypi_device.input_jog_dev = NULL; - return -ENOMEM; + error = -ENOMEM; + goto err_free_jogdev; } key_dev->name = "Sony Vaio Keys"; @@ -1234,56 +1199,122 @@ static int __devinit sonypi_create_input_devices(void) if (sonypi_inputkeys[i].inputev) set_bit(sonypi_inputkeys[i].inputev, key_dev->keybit); - input_register_device(jog_dev); - input_register_device(key_dev); + error = input_register_device(jog_dev); + if (error) + goto err_free_keydev; + + error = input_register_device(key_dev); + if (error) + goto err_unregister_jogdev; return 0; + + err_unregister_jogdev: + input_unregister_device(jog_dev); + /* Set to NULL so we don't free it again below */ + jog_dev = NULL; + err_free_keydev: + input_free_device(key_dev); + sonypi_device.input_key_dev = NULL; + err_free_jogdev: + input_free_device(jog_dev); + sonypi_device.input_jog_dev = NULL; + + return error; } -static int __devinit sonypi_probe(void) +static int __devinit sonypi_setup_ioports(struct sonypi_device *dev, + const struct sonypi_ioport_list *ioport_list) { - int i, ret; - struct sonypi_ioport_list *ioport_list; - struct sonypi_irq_list *irq_list; - struct pci_dev *pcidev; + while (ioport_list->port1) { - if ((pcidev = pci_get_device(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82371AB_3, NULL))) - sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE1; - else if ((pcidev = 
pci_get_device(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_ICH6_1, NULL))) - sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE3; - else - sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE2; + if (request_region(ioport_list->port1, + sonypi_device.region_size, + "Sony Programable I/O Device")) { + dev->ioport1 = ioport_list->port1; + dev->ioport2 = ioport_list->port2; + return 0; + } + ioport_list++; + } - sonypi_device.dev = pcidev; + return -EBUSY; +} + +static int __devinit sonypi_setup_irq(struct sonypi_device *dev, + const struct sonypi_irq_list *irq_list) +{ + while (irq_list->irq) { + + if (!request_irq(irq_list->irq, sonypi_irq, + SA_SHIRQ, "sonypi", sonypi_irq)) { + dev->irq = irq_list->irq; + dev->bits = irq_list->bits; + return 0; + } + irq_list++; + } + + return -EBUSY; +} + +static void __devinit sonypi_display_info(void) +{ + printk(KERN_INFO "sonypi: detected type%d model, " + "verbose = %d, fnkeyinit = %s, camera = %s, " + "compat = %s, mask = 0x%08lx, useinput = %s, acpi = %s\n", + sonypi_device.model, + verbose, + fnkeyinit ? "on" : "off", + camera ? "on" : "off", + compat ? "on" : "off", + mask, + useinput ? "on" : "off", + SONYPI_ACPI_ACTIVE ? "on" : "off"); + printk(KERN_INFO "sonypi: enabled at irq=%d, port1=0x%x, port2=0x%x\n", + sonypi_device.irq, + sonypi_device.ioport1, sonypi_device.ioport2); + + if (minor == -1) + printk(KERN_INFO "sonypi: device allocated minor is %d\n", + sonypi_misc_device.minor); +} + +static int __devinit sonypi_probe(struct platform_device *dev) +{ + const struct sonypi_ioport_list *ioport_list; + const struct sonypi_irq_list *irq_list; + struct pci_dev *pcidev; + int error; spin_lock_init(&sonypi_device.fifo_lock); sonypi_device.fifo = kfifo_alloc(SONYPI_BUF_SIZE, GFP_KERNEL, &sonypi_device.fifo_lock); if (IS_ERR(sonypi_device.fifo)) { printk(KERN_ERR "sonypi: kfifo_alloc failed\n"); - ret = PTR_ERR(sonypi_device.fifo); - goto out_fifo; + return PTR_ERR(sonypi_device.fifo); } init_waitqueue_head(&sonypi_device.fifo_proc_list); init_MUTEX(&sonypi_device.lock); sonypi_device.bluetooth_power = -1; + if ((pcidev = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_82371AB_3, NULL))) + sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE1; + else if ((pcidev = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_ICH6_1, NULL))) + sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE3; + else + sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE2; + if (pcidev && pci_enable_device(pcidev)) { printk(KERN_ERR "sonypi: pci_enable_device failed\n"); - ret = -EIO; - goto out_pcienable; - } - - if (minor != -1) - sonypi_misc_device.minor = minor; - if ((ret = misc_register(&sonypi_misc_device))) { - printk(KERN_ERR "sonypi: misc_register failed\n"); - goto out_miscreg; + error = -EIO; + goto err_put_pcidev; } + sonypi_device.dev = pcidev; if (sonypi_device.model == SONYPI_DEVICE_MODEL_TYPE1) { ioport_list = sonypi_type1_ioport_list; @@ -1302,43 +1333,36 @@ static int __devinit sonypi_probe(void) irq_list = sonypi_type3_irq_list; } - for (i = 0; ioport_list[i].port1; i++) { - if (request_region(ioport_list[i].port1, - sonypi_device.region_size, - "Sony Programable I/O Device")) { - /* get the ioport */ - sonypi_device.ioport1 = ioport_list[i].port1; - sonypi_device.ioport2 = ioport_list[i].port2; - break; - } - } - if (!sonypi_device.ioport1) { - printk(KERN_ERR "sonypi: request_region failed\n"); - ret = -ENODEV; - goto out_reqreg; + error = sonypi_setup_ioports(&sonypi_device, ioport_list); + if (error) { + printk(KERN_ERR "sonypi: failed to request ioports\n"); 
+ goto err_disable_pcidev; } - for (i = 0; irq_list[i].irq; i++) { - - sonypi_device.irq = irq_list[i].irq; - sonypi_device.bits = irq_list[i].bits; - - if (!request_irq(sonypi_device.irq, sonypi_irq, - SA_SHIRQ, "sonypi", sonypi_irq)) - break; + error = sonypi_setup_irq(&sonypi_device, irq_list); + if (error) { + printk(KERN_ERR "sonypi: request_irq failed\n"); + goto err_free_ioports; } - if (!irq_list[i].irq) { - printk(KERN_ERR "sonypi: request_irq failed\n"); - ret = -ENODEV; - goto out_reqirq; + if (minor != -1) + sonypi_misc_device.minor = minor; + error = misc_register(&sonypi_misc_device); + if (error) { + printk(KERN_ERR "sonypi: misc_register failed\n"); + goto err_free_irq; } + sonypi_display_info(); + if (useinput) { - ret = sonypi_create_input_devices(); - if (ret) - goto out_inputdevices; + error = sonypi_create_input_devices(); + if (error) { + printk(KERN_ERR + "sonypi: failed to create input devices\n"); + goto err_miscdev_unregister; + } spin_lock_init(&sonypi_device.input_fifo_lock); sonypi_device.input_fifo = @@ -1346,91 +1370,104 @@ static int __devinit sonypi_probe(void) &sonypi_device.input_fifo_lock); if (IS_ERR(sonypi_device.input_fifo)) { printk(KERN_ERR "sonypi: kfifo_alloc failed\n"); - ret = PTR_ERR(sonypi_device.input_fifo); - goto out_infifo; + error = PTR_ERR(sonypi_device.input_fifo); + goto err_inpdev_unregister; } INIT_WORK(&sonypi_device.input_work, input_keyrelease, NULL); } - sonypi_device.pdev = platform_device_register_simple("sonypi", -1, - NULL, 0); - if (IS_ERR(sonypi_device.pdev)) { - ret = PTR_ERR(sonypi_device.pdev); - goto out_platformdev; - } - sonypi_enable(0); - printk(KERN_INFO "sonypi: Sony Programmable I/O Controller Driver" - "v%s.\n", SONYPI_DRIVER_VERSION); - printk(KERN_INFO "sonypi: detected type%d model, " - "verbose = %d, fnkeyinit = %s, camera = %s, " - "compat = %s, mask = 0x%08lx, useinput = %s, acpi = %s\n", - sonypi_device.model, - verbose, - fnkeyinit ? "on" : "off", - camera ? "on" : "off", - compat ? "on" : "off", - mask, - useinput ? "on" : "off", - SONYPI_ACPI_ACTIVE ? "on" : "off"); - printk(KERN_INFO "sonypi: enabled at irq=%d, port1=0x%x, port2=0x%x\n", - sonypi_device.irq, - sonypi_device.ioport1, sonypi_device.ioport2); - - if (minor == -1) - printk(KERN_INFO "sonypi: device allocated minor is %d\n", - sonypi_misc_device.minor); - return 0; -out_platformdev: - kfifo_free(sonypi_device.input_fifo); -out_infifo: + err_inpdev_unregister: input_unregister_device(sonypi_device.input_key_dev); input_unregister_device(sonypi_device.input_jog_dev); -out_inputdevices: + err_miscdev_unregister: + misc_deregister(&sonypi_misc_device); + err_free_irq: free_irq(sonypi_device.irq, sonypi_irq); -out_reqirq: + err_free_ioports: release_region(sonypi_device.ioport1, sonypi_device.region_size); -out_reqreg: - misc_deregister(&sonypi_misc_device); -out_miscreg: + err_disable_pcidev: if (pcidev) pci_disable_device(pcidev); -out_pcienable: + err_put_pcidev: + pci_dev_put(pcidev); kfifo_free(sonypi_device.fifo); -out_fifo: - pci_dev_put(sonypi_device.dev); - return ret; + + return error; } -static void __devexit sonypi_remove(void) +static int __devexit sonypi_remove(struct platform_device *dev) { sonypi_disable(); synchronize_sched(); /* Allow sonypi interrupt to complete. 
*/ flush_scheduled_work(); - platform_device_unregister(sonypi_device.pdev); - if (useinput) { input_unregister_device(sonypi_device.input_key_dev); input_unregister_device(sonypi_device.input_jog_dev); kfifo_free(sonypi_device.input_fifo); } + misc_deregister(&sonypi_misc_device); + free_irq(sonypi_device.irq, sonypi_irq); release_region(sonypi_device.ioport1, sonypi_device.region_size); - misc_deregister(&sonypi_misc_device); - if (sonypi_device.dev) + + if (sonypi_device.dev) { pci_disable_device(sonypi_device.dev); + pci_dev_put(sonypi_device.dev); + } + kfifo_free(sonypi_device.fifo); - pci_dev_put(sonypi_device.dev); - printk(KERN_INFO "sonypi: removed.\n"); + + return 0; +} + +#ifdef CONFIG_PM +static int old_camera_power; + +static int sonypi_suspend(struct platform_device *dev, pm_message_t state) +{ + old_camera_power = sonypi_device.camera_power; + sonypi_disable(); + + return 0; +} + +static int sonypi_resume(struct platform_device *dev) +{ + sonypi_enable(old_camera_power); + return 0; +} +#else +#define sonypi_suspend NULL +#define sonypi_resume NULL +#endif + +static void sonypi_shutdown(struct platform_device *dev) +{ + sonypi_disable(); } +static struct platform_driver sonypi_driver = { + .driver = { + .name = "sonypi", + .owner = THIS_MODULE, + }, + .probe = sonypi_probe, + .remove = __devexit_p(sonypi_remove), + .shutdown = sonypi_shutdown, + .suspend = sonypi_suspend, + .resume = sonypi_resume, +}; + +static struct platform_device *sonypi_platform_device; + static struct dmi_system_id __initdata sonypi_dmi_table[] = { { .ident = "Sony Vaio", @@ -1451,26 +1488,43 @@ static struct dmi_system_id __initdata sonypi_dmi_table[] = { static int __init sonypi_init(void) { - int ret; + int error; + + printk(KERN_INFO + "sonypi: Sony Programmable I/O Controller Driver v%s.\n", + SONYPI_DRIVER_VERSION); if (!dmi_check_system(sonypi_dmi_table)) return -ENODEV; - ret = platform_driver_register(&sonypi_driver); - if (ret) - return ret; + error = platform_driver_register(&sonypi_driver); + if (error) + return error; - ret = sonypi_probe(); - if (ret) - platform_driver_unregister(&sonypi_driver); + sonypi_platform_device = platform_device_alloc("sonypi", -1); + if (!sonypi_platform_device) { + error = -ENOMEM; + goto err_driver_unregister; + } - return ret; + error = platform_device_add(sonypi_platform_device); + if (error) + goto err_free_device; + + return 0; + + err_free_device: + platform_device_put(sonypi_platform_device); + err_driver_unregister: + platform_driver_unregister(&sonypi_driver); + return error; } static void __exit sonypi_exit(void) { + platform_device_unregister(sonypi_platform_device); platform_driver_unregister(&sonypi_driver); - sonypi_remove(); + printk(KERN_INFO "sonypi: removed.\n"); } module_init(sonypi_init); -- cgit v1.2.2 From b368fae6abaa1736b8f26128c32947d47237b8e5 Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Sun, 8 Jan 2006 01:04:24 -0800 Subject: [PATCH] sonypi: Enable ACPI events for Sony laptop hotkeys Signed-off-by: Ben Collins Cc: "Brown, Len" Cc: linux-acpi@vger.kernel.org Cc: Stelian Pop Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/sonypi.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c index 6a9e23dc48..f8dd8527c6 100644 --- a/drivers/char/sonypi.c +++ b/drivers/char/sonypi.c @@ -510,6 +510,11 @@ static struct sonypi_device { #define SONYPI_ACPI_ACTIVE 0 #endif /* CONFIG_ACPI */ +#ifdef CONFIG_ACPI +static struct 
acpi_device *sonypi_acpi_device; +static int acpi_enabled; +#endif + static int sonypi_ec_write(u8 addr, u8 value) { #ifdef CONFIG_ACPI_EC @@ -863,6 +868,11 @@ found: if (useinput) sonypi_report_input_event(event); +#ifdef CONFIG_ACPI + if (acpi_enabled) + acpi_bus_generate_event(sonypi_acpi_device, 1, event); +#endif + kfifo_put(sonypi_device.fifo, (unsigned char *)&event, sizeof(event)); kill_fasync(&sonypi_device.fifo_async, SIGIO, POLL_IN); wake_up_interruptible(&sonypi_device.fifo_proc_list); @@ -1164,6 +1174,32 @@ static int sonypi_disable(void) return 0; } +#ifdef CONFIG_ACPI +static int sonypi_acpi_add(struct acpi_device *device) +{ + sonypi_acpi_device = device; + strcpy(acpi_device_name(device), "Sony laptop hotkeys"); + strcpy(acpi_device_class(device), "sony/hotkey"); + return 0; +} + +static int sonypi_acpi_remove(struct acpi_device *device, int type) +{ + sonypi_acpi_device = NULL; + return 0; +} + +static struct acpi_driver sonypi_acpi_driver = { + .name = "sonypi", + .class = "hkey", + .ids = "SNY6001", + .ops = { + .add = sonypi_acpi_add, + .remove = sonypi_acpi_remove, + }, +}; +#endif + static int __devinit sonypi_create_input_devices(void) { struct input_dev *jog_dev; @@ -1511,6 +1547,11 @@ static int __init sonypi_init(void) if (error) goto err_free_device; +#ifdef CONFIG_ACPI + if (acpi_bus_register_driver(&sonypi_acpi_driver) > 0) + acpi_enabled = 1; +#endif + return 0; err_free_device: @@ -1522,6 +1563,10 @@ static int __init sonypi_init(void) static void __exit sonypi_exit(void) { +#ifdef CONFIG_ACPI + if (acpi_enabled) + acpi_bus_unregister_driver(&sonypi_acpi_driver); +#endif platform_device_unregister(sonypi_platform_device); platform_driver_unregister(&sonypi_driver); printk(KERN_INFO "sonypi: removed.\n"); -- cgit v1.2.2 From eea8b54dc0dceb740da697a89c54d20dde340306 Mon Sep 17 00:00:00 2001 From: Ashutosh Naik Date: Sun, 8 Jan 2006 01:04:25 -0800 Subject: [PATCH] modules: prevent overriding of symbols Ensure that an exported symbol does not already exist in the kernel or in some other module's exported symbol table. This is done by checking the symbol tables for the exported symbol at the time of loading the module. Currently this is done after the relocation of the symbol. Signed-off-by: Ashutosh Naik Signed-off-by: Anand Krishnan Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/kernel/module.c b/kernel/module.c index 5d9078d6f0..fb6f091f59 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1204,6 +1204,39 @@ void *__symbol_get(const char *symbol) } EXPORT_SYMBOL_GPL(__symbol_get); +/* + * Ensure that an exported symbol [global namespace] does not already exist + * in the Kernel or in some other modules exported symbol table. 
+ */ +static int verify_export_symbols(struct module *mod) +{ + const char *name = NULL; + unsigned long i, ret = 0; + struct module *owner; + const unsigned long *crc; + + for (i = 0; i < mod->num_syms; i++) + if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) { + name = mod->syms[i].name; + ret = -ENOEXEC; + goto dup; + } + + for (i = 0; i < mod->num_gpl_syms; i++) + if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) { + name = mod->gpl_syms[i].name; + ret = -ENOEXEC; + goto dup; + } + +dup: + if (ret) + printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n", + mod->name, name, module_name(owner)); + + return ret; +} + /* Change all symbols so that sh_value encodes the pointer directly. */ static int simplify_symbols(Elf_Shdr *sechdrs, unsigned int symindex, @@ -1772,6 +1805,12 @@ static struct module *load_module(void __user *umod, goto cleanup; } + /* Find duplicate symbols */ + err = verify_export_symbols(mod); + + if (err < 0) + goto cleanup; + /* Set up and sort exception table */ mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); mod->extable = extable = (void *)sechdrs[exindex].sh_addr; -- cgit v1.2.2 From fb1697933a03ec47d794b38e2a4e3ccc2463fd22 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 8 Jan 2006 01:04:29 -0800 Subject: [PATCH] modules: mark TAINT_FORCED_RMMOD correctly Currently TAINT_FORCED_RMMOD is totally unused. Because it is marked as TAINT_FORCED_MODULE instead when user forced a module unload. This patch marks it correctly Signed-off-by: Akinobu Mita Cc: Rusty Russell Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index fb6f091f59..f368812ac0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -496,15 +496,15 @@ static void module_unload_free(struct module *mod) } #ifdef CONFIG_MODULE_FORCE_UNLOAD -static inline int try_force(unsigned int flags) +static inline int try_force_unload(unsigned int flags) { int ret = (flags & O_TRUNC); if (ret) - add_taint(TAINT_FORCED_MODULE); + add_taint(TAINT_FORCED_RMMOD); return ret; } #else -static inline int try_force(unsigned int flags) +static inline int try_force_unload(unsigned int flags) { return 0; } @@ -524,7 +524,7 @@ static int __try_stop_module(void *_sref) /* If it's not unused, quit unless we are told to block. */ if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { - if (!(*sref->forced = try_force(sref->flags))) + if (!(*sref->forced = try_force_unload(sref->flags))) return -EWOULDBLOCK; } @@ -609,7 +609,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) /* If it has an init func, it must have an exit func to unload */ if ((mod->init != NULL && mod->exit == NULL) || mod->unsafe) { - forced = try_force(flags); + forced = try_force_unload(flags); if (!forced) { /* This module can't be removed */ ret = -EBUSY; -- cgit v1.2.2 From 59d9136b9844d3a0376d93c945ab280decedb323 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Sun, 8 Jan 2006 01:04:34 -0800 Subject: [PATCH] aio: reorder kiocb structure elements to make sync iocb setup faster Reorder members of the kiocb structure to make sync kiocb setup faster. By setting the elements sequentially, the write combining buffers on the CPU are able to combine the writes into a single burst, which results in fewer cache cycles being consumed, freeing them up for other code. 
This results in a 10-20KB/s[*] increase on the bw_unix part of LMbench on my test system. * The improvement varies based on what other patches are in the system, as there are a number of bottlenecks, so this number is not absolutely accurate. Signed-off-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/aio.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/linux/aio.h b/include/linux/aio.h index 49fd37629e..00c8efa95c 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -94,26 +94,27 @@ struct kiocb { ssize_t (*ki_retry)(struct kiocb *); void (*ki_dtor)(struct kiocb *); - struct list_head ki_list; /* the aio core uses this - * for cancellation */ - union { void __user *user; struct task_struct *tsk; } ki_obj; + __u64 ki_user_data; /* user's data for completion */ + wait_queue_t ki_wait; loff_t ki_pos; + + void *private; /* State that we remember to be able to restart/retry */ unsigned short ki_opcode; size_t ki_nbytes; /* copy of iocb->aio_nbytes */ char __user *ki_buf; /* remaining iocb->aio_buf */ size_t ki_left; /* remaining bytes */ - wait_queue_t ki_wait; long ki_retried; /* just for testing */ long ki_kicked; /* just for testing */ long ki_queued; /* just for testing */ - void *private; + struct list_head ki_list; /* the aio core uses this + * for cancellation */ }; #define is_sync_kiocb(iocb) ((iocb)->ki_key == KIOCB_SYNC_KEY) @@ -126,6 +127,7 @@ struct kiocb { (x)->ki_filp = (filp); \ (x)->ki_ctx = NULL; \ (x)->ki_cancel = NULL; \ + (x)->ki_retry = NULL; \ (x)->ki_dtor = NULL; \ (x)->ki_obj.tsk = tsk; \ (x)->ki_user_data = 0; \ -- cgit v1.2.2 From 349aef0bc4c7f07d685c977e12d0e2d0b5d0e6db Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:04:36 -0800 Subject: [PATCH] shrink struct page Reduce the size of the pageframe for NR_CPUS>4, CONFIG_PREEMPT back to the minimal size by unionising both ->private and ->mapping with the pagetable lock. It uses an anonymous struct and hence requires gcc-3.x. Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ff54242c5..df80e63903 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -223,24 +223,27 @@ struct page { * & limit reverse map searches. */ union { - unsigned long private; /* Mapping-private opaque data: - * usually used for buffer_heads - * if PagePrivate set; used for - * swp_entry_t if PageSwapCache - * When page is free, this indicates - * order in the buddy system. - */ + struct { + unsigned long private; /* Mapping-private opaque data: + * usually used for buffer_heads + * if PagePrivate set; used for + * swp_entry_t if PageSwapCache. + * When page is free, this + * indicates order in the buddy + * system. + */ + struct address_space *mapping; /* If low bit clear, points to + * inode address_space, or NULL. + * If page mapped as anonymous + * memory, low bit is set, and + * it points to anon_vma object: + * see PAGE_MAPPING_ANON below. + */ + }; #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS - spinlock_t ptl; + spinlock_t ptl; #endif - } u; - struct address_space *mapping; /* If low bit clear, points to - * inode address_space, or NULL. - * If page mapped as anonymous - * memory, low bit is set, and - * it points to anon_vma object: - * see PAGE_MAPPING_ANON below. - */ + }; pgoff_t index; /* Our offset within mapping. 
*/ struct list_head lru; /* Pageout list, eg. active_list * protected by zone->lru_lock ! @@ -261,8 +264,8 @@ struct page { #endif /* WANT_PAGE_VIRTUAL */ }; -#define page_private(page) ((page)->u.private) -#define set_page_private(page, v) ((page)->u.private = (v)) +#define page_private(page) ((page)->private) +#define set_page_private(page, v) ((page)->private = (v)) /* * FIXME: take this include out, include page-flags.h in @@ -815,7 +818,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a * overflow into the next struct page (as it might with DEBUG_SPINLOCK). * When freeing, reset page->mapping so free_pages_check won't complain. */ -#define __pte_lockptr(page) &((page)->u.ptl) +#define __pte_lockptr(page) &((page)->ptl) #define pte_lock_init(_page) do { \ spin_lock_init(__pte_lockptr(_page)); \ } while (0) -- cgit v1.2.2 From eb46996f90a0826921f1a0d81535537a9c7f91b0 Mon Sep 17 00:00:00 2001 From: Ashutosh Naik Date: Sun, 8 Jan 2006 01:04:37 -0800 Subject: [PATCH] kernel/module.c: remove redundant spinlock in resolve_symbol() Remove the redundant spinlock in the function resolve_symbol() as we are not altering the module list, and we already hold the semaphore. Signed-off-by: Ashutosh Naik Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index f368812ac0..e4276046a1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -958,7 +958,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, unsigned long ret; const unsigned long *crc; - spin_lock_irq(&modlist_lock); ret = __find_symbol(name, &owner, &crc, mod->license_gplok); if (ret) { /* use_module can fail due to OOM, or module unloading */ @@ -966,7 +965,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, !use_module(mod, owner)) ret = 0; } - spin_unlock_irq(&modlist_lock); return ret; } -- cgit v1.2.2 From a12dea7af93ae83bd868c0dc09367090ead7cc1e Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Sun, 8 Jan 2006 01:04:49 -0800 Subject: [PATCH] PTRACE_SYSEMU is only for i386 and clashes with other ptrace codes of other archs PTRACE_SYSEMU{,_SINGLESTEP} is actually arch specific, for now, and the current allocated number clashes with a ptrace code of frv, i.e. PTRACE_GETFDPIC. I should have submitted this much earlier, anyway we get no breakage for this. CC: Daniel Jacobowitz Signed-off-by: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/ptrace.h | 3 +++ include/linux/ptrace.h | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h index 7e0f2945d1..f324c53b6f 100644 --- a/include/asm-i386/ptrace.h +++ b/include/asm-i386/ptrace.h @@ -54,6 +54,9 @@ struct pt_regs { #define PTRACE_GET_THREAD_AREA 25 #define PTRACE_SET_THREAD_AREA 26 +#define PTRACE_SYSEMU 31 +#define PTRACE_SYSEMU_SINGLESTEP 32 + #ifdef __KERNEL__ #include diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 864791996b..9d5cd106b3 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -20,8 +20,6 @@ #define PTRACE_DETACH 0x11 #define PTRACE_SYSCALL 24 -#define PTRACE_SYSEMU 31 -#define PTRACE_SYSEMU_SINGLESTEP 32 /* 0x4200-0x4300 are reserved for architecture-independent additions. */ #define PTRACE_SETOPTIONS 0x4200 -- cgit v1.2.2 From ac34dd052400b1e63aa8e711a13c0670943296fd Mon Sep 17 00:00:00 2001 From: "Maciej W. 
Rozycki" Date: Sun, 8 Jan 2006 01:04:50 -0800 Subject: [PATCH] fs/smbfs/proc.c: fix data corruption in smb_proc_setattr_unix() This patch fixes a data corruption in smb_proc_setattr_unix() (smb_filetype_from_mode() returns a u32, and there are only four bytes reserved for it in data). Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/smbfs/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index 38ab558835..d6baec0f24 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -3113,7 +3113,7 @@ smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, LSET(data, 32, SMB_TIME_NO_CHANGE); LSET(data, 40, SMB_UID_NO_CHANGE); LSET(data, 48, SMB_GID_NO_CHANGE); - LSET(data, 56, smb_filetype_from_mode(attr->ia_mode)); + DSET(data, 56, smb_filetype_from_mode(attr->ia_mode)); LSET(data, 60, major); LSET(data, 68, minor); LSET(data, 76, 0); -- cgit v1.2.2 From 15b2fe3931831891a62bad9cafd403c169ae087c Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Sun, 8 Jan 2006 01:04:51 -0800 Subject: [PATCH] UFS: inode->i_sem is not released in error path Signed-off-by: Evgeniy Polyakov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 54828ebcf1..2ba11a9aa9 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1296,8 +1296,10 @@ static ssize_t ufs_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + up(&inode->i_sem); return err; + } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; -- cgit v1.2.2 From 58591e8a1f4de2e53fdd09b2aa86948687106c07 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 8 Jan 2006 01:04:52 -0800 Subject: [PATCH] SubmittingPatches: diffstat options Add desired 'diffstat' options to use for kernel patches. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/SubmittingPatches | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches index 3c12fc1469..6198e5ebcf 100644 --- a/Documentation/SubmittingPatches +++ b/Documentation/SubmittingPatches @@ -386,6 +386,9 @@ a diffstat, to show what files have changed, and the number of inserted and deleted lines per file. A diffstat is especially useful on bigger patches. Other comments relevant only to the moment or the maintainer, not suitable for the permanent changelog, should also go here. +Use diffstat options "-p 1 -w 70" so that filenames are listed from the +top of the kernel source tree and don't use too much horizontal space +(easily fit in 80 columns, maybe with some indentation). See more details on the proper patch format in the following references. -- cgit v1.2.2 From 389cd6ec086e9b5421fe479d14d20a1faf74848d Mon Sep 17 00:00:00 2001 From: Eugene Surovegin Date: Sun, 8 Jan 2006 01:04:53 -0800 Subject: [PATCH] CREDITS update: Eugene Surovegin Add EMAC to my CREDITS entry.
Signed-off-by: Eugene Surovegin Signed-off-by: Linus Torvalds --- CREDITS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index 521f00d1b5..8e577ce4ab 100644 --- a/CREDITS +++ b/CREDITS @@ -3203,7 +3203,7 @@ N: Eugene Surovegin E: ebs@ebshome.net W: http://kernel.ebshome.net/ P: 1024D/AE5467F1 FF22 39F1 6728 89F6 6E6C 2365 7602 F33D AE54 67F1 -D: Embedded PowerPC 4xx: I2C, PIC and random hacks/fixes +D: Embedded PowerPC 4xx: EMAC, I2C, PIC and random hacks/fixes S: Sunnyvale, California 94085 S: USA -- cgit v1.2.2 From 7e7f358c8f8f836c504faa293fda0c1c0733b63c Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 8 Jan 2006 01:04:54 -0800 Subject: [PATCH] Split out screen_info from tty.h This makes it possible for boot code to use screen_info without dragging in all of tty.h. Signed-off-by: Brian Gerst Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/boot/compressed/misc.c | 2 +- arch/x86_64/boot/compressed/misc.c | 2 +- arch/x86_64/boot/compressed/miscsetup.h | 39 ----------------- include/linux/screen_info.h | 77 +++++++++++++++++++++++++++++++++ include/linux/tty.h | 72 +----------------------------- 5 files changed, 80 insertions(+), 112 deletions(-) delete mode 100644 arch/x86_64/boot/compressed/miscsetup.h create mode 100644 include/linux/screen_info.h diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index 82a807f9f5..f19f3a7492 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index 0e10fd84c7..cf4b88c416 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -9,7 +9,7 @@ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 
1996 */ -#include "miscsetup.h" +#include #include #include diff --git a/arch/x86_64/boot/compressed/miscsetup.h b/arch/x86_64/boot/compressed/miscsetup.h deleted file mode 100644 index bb16205317..0000000000 --- a/arch/x86_64/boot/compressed/miscsetup.h +++ /dev/null @@ -1,39 +0,0 @@ -#define NULL 0 -//typedef unsigned int size_t; - - -struct screen_info { - unsigned char orig_x; /* 0x00 */ - unsigned char orig_y; /* 0x01 */ - unsigned short dontuse1; /* 0x02 -- EXT_MEM_K sits here */ - unsigned short orig_video_page; /* 0x04 */ - unsigned char orig_video_mode; /* 0x06 */ - unsigned char orig_video_cols; /* 0x07 */ - unsigned short unused2; /* 0x08 */ - unsigned short orig_video_ega_bx; /* 0x0a */ - unsigned short unused3; /* 0x0c */ - unsigned char orig_video_lines; /* 0x0e */ - unsigned char orig_video_isVGA; /* 0x0f */ - unsigned short orig_video_points; /* 0x10 */ - - /* VESA graphic mode -- linear frame buffer */ - unsigned short lfb_width; /* 0x12 */ - unsigned short lfb_height; /* 0x14 */ - unsigned short lfb_depth; /* 0x16 */ - unsigned long lfb_base; /* 0x18 */ - unsigned long lfb_size; /* 0x1c */ - unsigned short dontuse2, dontuse3; /* 0x20 -- CL_MAGIC and CL_OFFSET here */ - unsigned short lfb_linelength; /* 0x24 */ - unsigned char red_size; /* 0x26 */ - unsigned char red_pos; /* 0x27 */ - unsigned char green_size; /* 0x28 */ - unsigned char green_pos; /* 0x29 */ - unsigned char blue_size; /* 0x2a */ - unsigned char blue_pos; /* 0x2b */ - unsigned char rsvd_size; /* 0x2c */ - unsigned char rsvd_pos; /* 0x2d */ - unsigned short vesapm_seg; /* 0x2e */ - unsigned short vesapm_off; /* 0x30 */ - unsigned short pages; /* 0x32 */ - /* 0x34 -- 0x3f reserved for future expansion */ -}; diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h new file mode 100644 index 0000000000..76850b75b3 --- /dev/null +++ b/include/linux/screen_info.h @@ -0,0 +1,77 @@ +#ifndef _SCREEN_INFO_H +#define _SCREEN_INFO_H + +#include + +/* + * These are set up by the setup-routine at boot-time: + */ + +struct screen_info { + u8 orig_x; /* 0x00 */ + u8 orig_y; /* 0x01 */ + u16 dontuse1; /* 0x02 -- EXT_MEM_K sits here */ + u16 orig_video_page; /* 0x04 */ + u8 orig_video_mode; /* 0x06 */ + u8 orig_video_cols; /* 0x07 */ + u16 unused2; /* 0x08 */ + u16 orig_video_ega_bx; /* 0x0a */ + u16 unused3; /* 0x0c */ + u8 orig_video_lines; /* 0x0e */ + u8 orig_video_isVGA; /* 0x0f */ + u16 orig_video_points; /* 0x10 */ + + /* VESA graphic mode -- linear frame buffer */ + u16 lfb_width; /* 0x12 */ + u16 lfb_height; /* 0x14 */ + u16 lfb_depth; /* 0x16 */ + u32 lfb_base; /* 0x18 */ + u32 lfb_size; /* 0x1c */ + u16 dontuse2, dontuse3; /* 0x20 -- CL_MAGIC and CL_OFFSET here */ + u16 lfb_linelength; /* 0x24 */ + u8 red_size; /* 0x26 */ + u8 red_pos; /* 0x27 */ + u8 green_size; /* 0x28 */ + u8 green_pos; /* 0x29 */ + u8 blue_size; /* 0x2a */ + u8 blue_pos; /* 0x2b */ + u8 rsvd_size; /* 0x2c */ + u8 rsvd_pos; /* 0x2d */ + u16 vesapm_seg; /* 0x2e */ + u16 vesapm_off; /* 0x30 */ + u16 pages; /* 0x32 */ + u16 vesa_attributes; /* 0x34 */ + u32 capabilities; /* 0x36 */ + /* 0x3a -- 0x3f reserved for future expansion */ +}; + +extern struct screen_info screen_info; + +#define ORIG_X (screen_info.orig_x) +#define ORIG_Y (screen_info.orig_y) +#define ORIG_VIDEO_MODE (screen_info.orig_video_mode) +#define ORIG_VIDEO_COLS (screen_info.orig_video_cols) +#define ORIG_VIDEO_EGA_BX (screen_info.orig_video_ega_bx) +#define ORIG_VIDEO_LINES (screen_info.orig_video_lines) +#define ORIG_VIDEO_ISVGA (screen_info.orig_video_isVGA) 
+#define ORIG_VIDEO_POINTS (screen_info.orig_video_points) + +#define VIDEO_TYPE_MDA 0x10 /* Monochrome Text Display */ +#define VIDEO_TYPE_CGA 0x11 /* CGA Display */ +#define VIDEO_TYPE_EGAM 0x20 /* EGA/VGA in Monochrome Mode */ +#define VIDEO_TYPE_EGAC 0x21 /* EGA in Color Mode */ +#define VIDEO_TYPE_VGAC 0x22 /* VGA+ in Color Mode */ +#define VIDEO_TYPE_VLFB 0x23 /* VESA VGA in graphic mode */ + +#define VIDEO_TYPE_PICA_S3 0x30 /* ACER PICA-61 local S3 video */ +#define VIDEO_TYPE_MIPS_G364 0x31 /* MIPS Magnum 4000 G364 video */ +#define VIDEO_TYPE_SGI 0x33 /* Various SGI graphics hardware */ + +#define VIDEO_TYPE_TGAC 0x40 /* DEC TGA */ + +#define VIDEO_TYPE_SUN 0x50 /* Sun frame buffer. */ +#define VIDEO_TYPE_SUNPCI 0x51 /* Sun PCI based frame buffer. */ + +#define VIDEO_TYPE_PMAC 0x60 /* PowerMacintosh frame buffer. */ + +#endif /* _SCREEN_INFO_H */ diff --git a/include/linux/tty.h b/include/linux/tty.h index 1267f88ece..57449704a4 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -36,77 +37,6 @@ #define NR_UNIX98_PTY_MAX (1 << MINORBITS) /* Absolute limit */ #define NR_LDISCS 16 -/* - * These are set up by the setup-routine at boot-time: - */ - -struct screen_info { - u8 orig_x; /* 0x00 */ - u8 orig_y; /* 0x01 */ - u16 dontuse1; /* 0x02 -- EXT_MEM_K sits here */ - u16 orig_video_page; /* 0x04 */ - u8 orig_video_mode; /* 0x06 */ - u8 orig_video_cols; /* 0x07 */ - u16 unused2; /* 0x08 */ - u16 orig_video_ega_bx; /* 0x0a */ - u16 unused3; /* 0x0c */ - u8 orig_video_lines; /* 0x0e */ - u8 orig_video_isVGA; /* 0x0f */ - u16 orig_video_points; /* 0x10 */ - - /* VESA graphic mode -- linear frame buffer */ - u16 lfb_width; /* 0x12 */ - u16 lfb_height; /* 0x14 */ - u16 lfb_depth; /* 0x16 */ - u32 lfb_base; /* 0x18 */ - u32 lfb_size; /* 0x1c */ - u16 dontuse2, dontuse3; /* 0x20 -- CL_MAGIC and CL_OFFSET here */ - u16 lfb_linelength; /* 0x24 */ - u8 red_size; /* 0x26 */ - u8 red_pos; /* 0x27 */ - u8 green_size; /* 0x28 */ - u8 green_pos; /* 0x29 */ - u8 blue_size; /* 0x2a */ - u8 blue_pos; /* 0x2b */ - u8 rsvd_size; /* 0x2c */ - u8 rsvd_pos; /* 0x2d */ - u16 vesapm_seg; /* 0x2e */ - u16 vesapm_off; /* 0x30 */ - u16 pages; /* 0x32 */ - u16 vesa_attributes; /* 0x34 */ - u32 capabilities; /* 0x36 */ - /* 0x3a -- 0x3f reserved for future expansion */ -}; - -extern struct screen_info screen_info; - -#define ORIG_X (screen_info.orig_x) -#define ORIG_Y (screen_info.orig_y) -#define ORIG_VIDEO_MODE (screen_info.orig_video_mode) -#define ORIG_VIDEO_COLS (screen_info.orig_video_cols) -#define ORIG_VIDEO_EGA_BX (screen_info.orig_video_ega_bx) -#define ORIG_VIDEO_LINES (screen_info.orig_video_lines) -#define ORIG_VIDEO_ISVGA (screen_info.orig_video_isVGA) -#define ORIG_VIDEO_POINTS (screen_info.orig_video_points) - -#define VIDEO_TYPE_MDA 0x10 /* Monochrome Text Display */ -#define VIDEO_TYPE_CGA 0x11 /* CGA Display */ -#define VIDEO_TYPE_EGAM 0x20 /* EGA/VGA in Monochrome Mode */ -#define VIDEO_TYPE_EGAC 0x21 /* EGA in Color Mode */ -#define VIDEO_TYPE_VGAC 0x22 /* VGA+ in Color Mode */ -#define VIDEO_TYPE_VLFB 0x23 /* VESA VGA in graphic mode */ - -#define VIDEO_TYPE_PICA_S3 0x30 /* ACER PICA-61 local S3 video */ -#define VIDEO_TYPE_MIPS_G364 0x31 /* MIPS Magnum 4000 G364 video */ -#define VIDEO_TYPE_SGI 0x33 /* Various SGI graphics hardware */ - -#define VIDEO_TYPE_TGAC 0x40 /* DEC TGA */ - -#define VIDEO_TYPE_SUN 0x50 /* Sun frame buffer. */ -#define VIDEO_TYPE_SUNPCI 0x51 /* Sun PCI based frame buffer. 
*/ - -#define VIDEO_TYPE_PMAC 0x60 /* PowerMacintosh frame buffer. */ - /* * This character is the same as _POSIX_VDISABLE: it cannot be used as * a c_cc[] character, but indicates that a particular special character -- cgit v1.2.2 From f5ef3c105bee3a52486d7b55cef3330fcde9bca6 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Sun, 8 Jan 2006 01:04:56 -0800 Subject: [PATCH] v9fs: fix fd_close If a 9pfs server crashes, v9fs_fd_close() is called. Subsequently, in cleaning up by performing a umount() on the FS that was provided by this server v9fs_fd_close() is called again, and uses the old, freed valus of trans->priv. This patch ensures that trans->priv can be freed only once, otherwise this function bails early. Signed-off-by: Michal Ostrowski Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/trans_fd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c index 63b58ce98f..b7ffb98595 100644 --- a/fs/9p/trans_fd.c +++ b/fs/9p/trans_fd.c @@ -148,12 +148,12 @@ static void v9fs_fd_close(struct v9fs_transport *trans) if (!trans) return; - trans->status = Disconnected; - ts = trans->priv; + ts = xchg(&trans->priv, NULL); if (!ts) return; + trans->status = Disconnected; if (ts->in_file) fput(ts->in_file); -- cgit v1.2.2 From 3cf6429a26da5c4d7b795e6d0f8f56ed2e4fdfc0 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Sun, 8 Jan 2006 01:04:58 -0800 Subject: [PATCH] v9fs: new multiplexer implementation New multiplexer implementation. Decreases the number of kernel threads required. Better handling when the user process receives a signal. Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/9p.c | 68 +++- fs/9p/9p.h | 9 +- fs/9p/conv.c | 86 ++-- fs/9p/conv.h | 13 +- fs/9p/fid.c | 2 +- fs/9p/mux.c | 1122 +++++++++++++++++++++++++++++++++++++--------------- fs/9p/mux.h | 40 +- fs/9p/trans_fd.c | 49 ++- fs/9p/trans_sock.c | 161 +++++--- fs/9p/transport.h | 4 +- fs/9p/v9fs.c | 41 +- fs/9p/v9fs.h | 17 +- fs/9p/vfs_dentry.c | 13 +- fs/9p/vfs_dir.c | 17 +- fs/9p/vfs_inode.c | 89 ++--- fs/9p/vfs_super.c | 3 +- 16 files changed, 1172 insertions(+), 562 deletions(-) diff --git a/fs/9p/9p.c b/fs/9p/9p.c index e847f504a4..a3a1ac6107 100644 --- a/fs/9p/9p.c +++ b/fs/9p/9p.c @@ -52,10 +52,11 @@ v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, dprintk(DEBUG_9P, "msize: %d version: %s\n", msize, version); msg.id = TVERSION; + msg.tag = ~0; msg.params.tversion.msize = msize; msg.params.tversion.version = version; - return v9fs_mux_rpc(v9ses, &msg, fcall); + return v9fs_mux_rpc(v9ses->mux, &msg, fcall); } /** @@ -83,7 +84,30 @@ v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, msg.params.tattach.uname = uname; msg.params.tattach.aname = aname; - return v9fs_mux_rpc(v9ses, &msg, fcall); + return v9fs_mux_rpc(v9ses->mux, &msg, fcall); +} + +static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc, + struct v9fs_fcall *rc, int err) +{ + int fid; + struct v9fs_session_info *v9ses; + + if (err) + return; + + fid = tc->params.tclunk.fid; + kfree(tc); + + if (!rc) + return; + + dprintk(DEBUG_9P, "tcall id %d rcall id %d\n", tc->id, rc->id); + v9ses = a; + if (rc->id == RCLUNK) + v9fs_put_idpool(fid, &v9ses->fidpool); + + kfree(rc); } /** @@ -93,18 +117,24 @@ v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, * @fcall: pointer to response fcall pointer * */ - int -v9fs_t_clunk(struct 
v9fs_session_info *v9ses, u32 fid, - struct v9fs_fcall **fcall) +v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid) { - struct v9fs_fcall msg; + int err; + struct v9fs_fcall *tc, *rc; + + tc = kmalloc(sizeof(struct v9fs_fcall), GFP_KERNEL); dprintk(DEBUG_9P, "fid %d\n", fid); - msg.id = TCLUNK; - msg.params.tclunk.fid = fid; + tc->id = TCLUNK; + tc->params.tclunk.fid = fid; - return v9fs_mux_rpc(v9ses, &msg, fcall); + err = v9fs_mux_rpc(v9ses->mux, tc, &rc); + if (err >= 0) { + v9fs_t_clunk_cb(v9ses, tc, rc, 0); + } + + return err; } /** @@ -121,7 +151,7 @@ int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 tag) dprintk(DEBUG_9P, "oldtag %d\n", tag); msg.id = TFLUSH; msg.params.tflush.oldtag = tag; - return v9fs_mux_rpc(v9ses, &msg, NULL); + return v9fs_mux_rpc(v9ses->mux, &msg, NULL); } /** @@ -143,7 +173,7 @@ v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **fcall) msg.id = TSTAT; msg.params.tstat.fid = fid; - return v9fs_mux_rpc(v9ses, &msg, fcall); + return v9fs_mux_rpc(v9ses->mux, &msg, fcall); } /** @@ -166,7 +196,7 @@ v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid, msg.params.twstat.fid = fid; msg.params.twstat.stat = stat; - return v9fs_mux_rpc(v9ses, &msg, fcall); + return v9fs_mux_rpc(v9ses->mux, &msg, fcall); } /** @@ -199,7 +229,7 @@ v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid, msg.params.twalk.nwname = 0; } - return v9fs_mux_rpc(v9ses, &msg, fcall); + return v9fs_mux_rpc(v9ses->mux, &msg, fcall); } /** @@ -217,14 +247,14 @@ v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode, struct v9fs_fcall **fcall) { struct v9fs_fcall msg; - long errorno = -1; + int errorno = -1; dprintk(DEBUG_9P, "fid %d mode %d\n", fid, mode); msg.id = TOPEN; msg.params.topen.fid = fid; msg.params.topen.mode = mode; - errorno = v9fs_mux_rpc(v9ses, &msg, fcall); + errorno = v9fs_mux_rpc(v9ses->mux, &msg, fcall); return errorno; } @@ -246,7 +276,7 @@ v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid, dprintk(DEBUG_9P, "fid %d\n", fid); msg.id = TREMOVE; msg.params.tremove.fid = fid; - return v9fs_mux_rpc(v9ses, &msg, fcall); + return v9fs_mux_rpc(v9ses->mux, &msg, fcall); } /** @@ -275,7 +305,7 @@ v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name, msg.params.tcreate.perm = perm; msg.params.tcreate.mode = mode; - return v9fs_mux_rpc(v9ses, &msg, fcall); + return v9fs_mux_rpc(v9ses->mux, &msg, fcall); } /** @@ -302,7 +332,7 @@ v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset, msg.params.tread.fid = fid; msg.params.tread.offset = offset; msg.params.tread.count = count; - errorno = v9fs_mux_rpc(v9ses, &msg, &rc); + errorno = v9fs_mux_rpc(v9ses->mux, &msg, &rc); if (!errorno) { errorno = rc->params.rread.count; @@ -345,7 +375,7 @@ v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, msg.params.twrite.count = count; msg.params.twrite.data = data; - errorno = v9fs_mux_rpc(v9ses, &msg, &rc); + errorno = v9fs_mux_rpc(v9ses->mux, &msg, &rc); if (!errorno) errorno = rc->params.rwrite.count; diff --git a/fs/9p/9p.h b/fs/9p/9p.h index f55424216b..6355392786 100644 --- a/fs/9p/9p.h +++ b/fs/9p/9p.h @@ -100,6 +100,9 @@ enum { V9FS_QTFILE = 0x00, }; +#define V9FS_NOTAG (u16)(~0) +#define V9FS_NOFID (u32)(~0) + /* ample room for Twrite/Rread header (iounit) */ #define V9FS_IOHDRSZ 24 @@ -303,6 +306,9 @@ struct v9fs_fcall { } params; }; +#define V9FS_FCALLHDRSZ (sizeof(struct v9fs_fcall) + \ + sizeof(struct v9fs_stat) + 16*sizeof(struct v9fs_qid) + 16) + #define FCALL_ERROR(fcall) (fcall ? 
fcall->params.rerror.error : "") int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, @@ -311,8 +317,7 @@ int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, int v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, u32 fid, u32 afid, struct v9fs_fcall **rcall); -int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_fcall **rcall); +int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid); int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag); diff --git a/fs/9p/conv.c b/fs/9p/conv.c index 18121af99d..1b9b15dfea 100644 --- a/fs/9p/conv.c +++ b/fs/9p/conv.c @@ -208,7 +208,7 @@ static inline char *buf_get_stringb(struct cbuf *buf, struct cbuf *sbuf) len = buf_get_int16(buf); if (!buf_check_overflow(buf) && buf_check_size(buf, len) && - buf_check_size(sbuf, len+1)) { + buf_check_size(sbuf, len + 1)) { memcpy(sbuf->p, buf->p, len); sbuf->p[len] = 0; @@ -252,13 +252,12 @@ static inline void *buf_get_datab(struct cbuf *buf, struct cbuf *dbuf, /** * v9fs_size_stat - calculate the size of a variable length stat struct - * @v9ses: session information * @stat: metadata (stat) structure + * @extended: non-zero if 9P2000.u * */ -static int v9fs_size_stat(struct v9fs_session_info *v9ses, - struct v9fs_stat *stat) +static int v9fs_size_stat(struct v9fs_stat *stat, int extended) { int size = 0; @@ -288,7 +287,7 @@ static int v9fs_size_stat(struct v9fs_session_info *v9ses, if (stat->muid) size += strlen(stat->muid); - if (v9ses->extended) { + if (extended) { size += 4 + /* n_uid[4] */ 4 + /* n_gid[4] */ 4 + /* n_muid[4] */ @@ -302,15 +301,14 @@ static int v9fs_size_stat(struct v9fs_session_info *v9ses, /** * serialize_stat - safely format a stat structure for transmission - * @v9ses: session info * @stat: metadata (stat) structure * @bufp: buffer to serialize structure into + * @extended: non-zero if 9P2000.u * */ static int -serialize_stat(struct v9fs_session_info *v9ses, struct v9fs_stat *stat, - struct cbuf *bufp) +serialize_stat(struct v9fs_stat *stat, struct cbuf *bufp, int extended) { buf_put_int16(bufp, stat->size); buf_put_int16(bufp, stat->type); @@ -328,7 +326,7 @@ serialize_stat(struct v9fs_session_info *v9ses, struct v9fs_stat *stat, buf_put_string(bufp, stat->gid); buf_put_string(bufp, stat->muid); - if (v9ses->extended) { + if (extended) { buf_put_string(bufp, stat->extension); buf_put_int32(bufp, stat->n_uid); buf_put_int32(bufp, stat->n_gid); @@ -343,16 +341,16 @@ serialize_stat(struct v9fs_session_info *v9ses, struct v9fs_stat *stat, /** * deserialize_stat - safely decode a recieved metadata (stat) structure - * @v9ses: session info * @bufp: buffer to deserialize * @stat: metadata (stat) structure * @dbufp: buffer to deserialize variable strings into + * @extended: non-zero if 9P2000.u * */ static inline int -deserialize_stat(struct v9fs_session_info *v9ses, struct cbuf *bufp, - struct v9fs_stat *stat, struct cbuf *dbufp) +deserialize_stat(struct cbuf *bufp, struct v9fs_stat *stat, + struct cbuf *dbufp, int extended) { stat->size = buf_get_int16(bufp); @@ -370,7 +368,7 @@ deserialize_stat(struct v9fs_session_info *v9ses, struct cbuf *bufp, stat->gid = buf_get_stringb(bufp, dbufp); stat->muid = buf_get_stringb(bufp, dbufp); - if (v9ses->extended) { + if (extended) { stat->extension = buf_get_stringb(bufp, dbufp); stat->n_uid = buf_get_int32(bufp); stat->n_gid = buf_get_int32(bufp); @@ -385,20 +383,20 @@ deserialize_stat(struct v9fs_session_info *v9ses, struct cbuf *bufp, /** * deserialize_statb - wrapper for decoding a 
received metadata structure - * @v9ses: session info * @bufp: buffer to deserialize * @dbufp: buffer to deserialize variable strings into + * @extended: non-zero if 9P2000.u * */ -static inline struct v9fs_stat *deserialize_statb(struct v9fs_session_info - *v9ses, struct cbuf *bufp, - struct cbuf *dbufp) +static inline struct v9fs_stat *deserialize_statb(struct cbuf *bufp, + struct cbuf *dbufp, + int extended) { struct v9fs_stat *ret = buf_alloc(dbufp, sizeof(struct v9fs_stat)); if (ret) { - int n = deserialize_stat(v9ses, bufp, ret, dbufp); + int n = deserialize_stat(bufp, ret, dbufp, extended); if (n <= 0) return NULL; } @@ -408,17 +406,16 @@ static inline struct v9fs_stat *deserialize_statb(struct v9fs_session_info /** * v9fs_deserialize_stat - decode a received metadata structure - * @v9ses: session info * @buf: buffer to deserialize * @buflen: length of received buffer * @stat: metadata structure to decode into * @statlen: length of destination metadata structure + * @extended: non-zero if 9P2000.u * */ -int -v9fs_deserialize_stat(struct v9fs_session_info *v9ses, void *buf, - u32 buflen, struct v9fs_stat *stat, u32 statlen) +int v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat, + u32 statlen, int extended) { struct cbuf buffer; struct cbuf *bufp = &buffer; @@ -429,11 +426,10 @@ v9fs_deserialize_stat(struct v9fs_session_info *v9ses, void *buf, buf_init(dbufp, (char *)stat + sizeof(struct v9fs_stat), statlen - sizeof(struct v9fs_stat)); - return deserialize_stat(v9ses, bufp, stat, dbufp); + return deserialize_stat(bufp, stat, dbufp, extended); } -static inline int -v9fs_size_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall) +static inline int v9fs_size_fcall(struct v9fs_fcall *fcall, int extended) { int size = 4 + 1 + 2; /* size[4] msg[1] tag[2] */ int i = 0; @@ -485,7 +481,7 @@ v9fs_size_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall) break; case TWSTAT: /* fid[4] stat[n] */ fcall->params.twstat.stat->size = - v9fs_size_stat(v9ses, fcall->params.twstat.stat); + v9fs_size_stat(fcall->params.twstat.stat, extended); size += 4 + 2 + 2 + fcall->params.twstat.stat->size; } return size; @@ -493,16 +489,16 @@ v9fs_size_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall) /* * v9fs_serialize_fcall - marshall fcall struct into a packet - * @v9ses: session information * @fcall: structure to convert * @data: buffer to serialize fcall into * @datalen: length of buffer to serialize fcall into + * @extended: non-zero if 9P2000.u * */ int -v9fs_serialize_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall, - void *data, u32 datalen) +v9fs_serialize_fcall(struct v9fs_fcall *fcall, void *data, u32 datalen, + int extended) { int i = 0; struct v9fs_stat *stat = NULL; @@ -516,7 +512,7 @@ v9fs_serialize_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall, return -EINVAL; } - fcall->size = v9fs_size_fcall(v9ses, fcall); + fcall->size = v9fs_size_fcall(fcall, extended); buf_put_int32(bufp, fcall->size); buf_put_int8(bufp, fcall->id); @@ -591,31 +587,31 @@ v9fs_serialize_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall, stat = fcall->params.twstat.stat; buf_put_int16(bufp, stat->size + 2); - serialize_stat(v9ses, stat, bufp); + serialize_stat(stat, bufp, extended); break; } - if (buf_check_overflow(bufp)) + if (buf_check_overflow(bufp)) { + dprintk(DEBUG_ERROR, "buffer overflow\n"); return -EIO; + } return fcall->size; } /** * deserialize_fcall - unmarshal a response - * @v9ses: session information - * 
@msgsize: size of rcall message * @buf: recieved buffer * @buflen: length of received buffer * @rcall: fcall structure to populate * @rcalllen: length of fcall structure to populate + * @extended: non-zero if 9P2000.u * */ int -v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize, - void *buf, u32 buflen, struct v9fs_fcall *rcall, - int rcalllen) +v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, + int rcalllen, int extended) { struct cbuf buffer; @@ -628,7 +624,7 @@ v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize, buf_init(dbufp, (char *)rcall + sizeof(struct v9fs_fcall), rcalllen - sizeof(struct v9fs_fcall)); - rcall->size = msgsize; + rcall->size = buf_get_int32(bufp); rcall->id = buf_get_int8(bufp); rcall->tag = buf_get_int16(bufp); @@ -651,6 +647,12 @@ v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize, break; case RWALK: rcall->params.rwalk.nwqid = buf_get_int16(bufp); + if (rcall->params.rwalk.nwqid > 16) { + eprintk(KERN_ERR, "Rwalk with more than 16 qids: %d\n", + rcall->params.rwalk.nwqid); + return -EPROTO; + } + rcall->params.rwalk.wqids = buf_alloc(dbufp, rcall->params.rwalk.nwqid * sizeof(struct v9fs_qid)); if (rcall->params.rwalk.wqids) @@ -690,19 +692,21 @@ v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize, case RSTAT: buf_get_int16(bufp); rcall->params.rstat.stat = - deserialize_statb(v9ses, bufp, dbufp); + deserialize_statb(bufp, dbufp, extended); break; case RWSTAT: break; case RERROR: rcall->params.rerror.error = buf_get_stringb(bufp, dbufp); - if (v9ses->extended) + if (extended) rcall->params.rerror.errno = buf_get_int16(bufp); break; } - if (buf_check_overflow(bufp) || buf_check_overflow(dbufp)) + if (buf_check_overflow(bufp) || buf_check_overflow(dbufp)) { + dprintk(DEBUG_ERROR, "buffer overflow\n"); return -EIO; + } return rcall->size; } diff --git a/fs/9p/conv.h b/fs/9p/conv.h index ee849613c6..d5e33e17a6 100644 --- a/fs/9p/conv.h +++ b/fs/9p/conv.h @@ -24,13 +24,12 @@ * */ -int v9fs_deserialize_stat(struct v9fs_session_info *, void *buf, - u32 buflen, struct v9fs_stat *stat, u32 statlen); -int v9fs_serialize_fcall(struct v9fs_session_info *, struct v9fs_fcall *tcall, - void *buf, u32 buflen); -int v9fs_deserialize_fcall(struct v9fs_session_info *, u32 msglen, - void *buf, u32 buflen, struct v9fs_fcall *rcall, - int rcalllen); +int v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat, + u32 statlen, int extended); +int v9fs_serialize_fcall(struct v9fs_fcall *tcall, void *buf, u32 buflen, + int extended); +int v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, + int rcalllen, int extended); /* this one is actually in error.c right now */ int v9fs_errstr2errno(char *errstr); diff --git a/fs/9p/fid.c b/fs/9p/fid.c index d95f8626d1..60ef8aba75 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -164,7 +164,7 @@ static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry) return v9fs_fid_create(dentry, v9ses, fidnum, 0); clunk_fid: - v9fs_t_clunk(v9ses, fidnum, NULL); + v9fs_t_clunk(v9ses, fidnum); return ERR_PTR(err); } diff --git a/fs/9p/mux.c b/fs/9p/mux.c index 8835b576f7..62b6ad0767 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -4,7 +4,7 @@ * Protocol Multiplexer * * Copyright (C) 2004 by Eric Van Hensbergen - * Copyright (C) 2004 by Latchesar Ionkov + * Copyright (C) 2004-2005 by Latchesar Ionkov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ 
-28,6 +28,7 @@ #include #include #include +#include #include #include @@ -38,438 +39,903 @@ #include "conv.h" #include "mux.h" -/** - * dprintcond - print condition of session info - * @v9ses: session info structure - * @req: RPC request structure - * - */ +#define ERREQFLUSH 1 +#define SCHED_TIMEOUT 10 +#define MAXPOLLWADDR 2 + +enum { + Rworksched = 1, /* read work scheduled or running */ + Rpending = 2, /* can read */ + Wworksched = 4, /* write work scheduled or running */ + Wpending = 8, /* can write */ +}; + +struct v9fs_mux_poll_task; + +struct v9fs_req { + int tag; + struct v9fs_fcall *tcall; + struct v9fs_fcall *rcall; + int err; + v9fs_mux_req_callback cb; + void *cba; + struct list_head req_list; +}; + +struct v9fs_mux_data { + spinlock_t lock; + struct list_head mux_list; + struct v9fs_mux_poll_task *poll_task; + int msize; + unsigned char *extended; + struct v9fs_transport *trans; + struct v9fs_idpool tidpool; + int err; + wait_queue_head_t equeue; + struct list_head req_list; + struct list_head unsent_req_list; + int rpos; + char *rbuf; + int wpos; + int wsize; + char *wbuf; + wait_queue_t poll_wait[MAXPOLLWADDR]; + wait_queue_head_t *poll_waddr[MAXPOLLWADDR]; + poll_table pt; + struct work_struct rq; + struct work_struct wq; + unsigned long wsched; +}; + +struct v9fs_mux_poll_task { + struct task_struct *task; + struct list_head mux_list; + int muxnum; +}; + +struct v9fs_mux_rpc { + struct v9fs_mux_data *m; + struct v9fs_req *req; + int err; + struct v9fs_fcall *rcall; + wait_queue_head_t wqueue; +}; + +static int v9fs_poll_proc(void *); +static void v9fs_read_work(void *); +static void v9fs_write_work(void *); +static void v9fs_pollwait(struct file *filp, wait_queue_head_t * wait_address, + poll_table * p); + +static DECLARE_MUTEX(v9fs_mux_task_lock); +static struct workqueue_struct *v9fs_mux_wq; + +static int v9fs_mux_num; +static int v9fs_mux_poll_task_num; +static struct v9fs_mux_poll_task v9fs_mux_poll_tasks[100]; + +void v9fs_mux_global_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) + v9fs_mux_poll_tasks[i].task = NULL; + + v9fs_mux_wq = create_workqueue("v9fs"); +} -static inline int -dprintcond(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req) +void v9fs_mux_global_exit(void) { - dprintk(DEBUG_MUX, "condition: %d, %p\n", v9ses->transport->status, - req->rcall); - return 0; + destroy_workqueue(v9fs_mux_wq); } /** - * xread - force read of a certain number of bytes - * @v9ses: session info structure - * @ptr: pointer to buffer - * @sz: number of bytes to read + * v9fs_mux_calc_poll_procs - calculates the number of polling procs + * based on the number of mounted v9fs filesystems. * - * Chuck Cranor CS-533 project1 + * The current implementation returns sqrt of the number of mounts. */ +inline int v9fs_mux_calc_poll_procs(int muxnum) +{ + int n; + + if (v9fs_mux_poll_task_num) + n = muxnum / v9fs_mux_poll_task_num + + (muxnum % v9fs_mux_poll_task_num ? 
1 : 0); + else + n = 1; + + if (n > ARRAY_SIZE(v9fs_mux_poll_tasks)) + n = ARRAY_SIZE(v9fs_mux_poll_tasks); -static int xread(struct v9fs_session_info *v9ses, void *ptr, unsigned long sz) + return n; +} + +static void v9fs_mux_poll_start(struct v9fs_mux_data *m) { - int rd = 0; - int ret = 0; - while (rd < sz) { - ret = v9ses->transport->read(v9ses->transport, ptr, sz - rd); - if (ret <= 0) { - dprintk(DEBUG_ERROR, "xread errno %d\n", ret); - return ret; + int i, n; + struct v9fs_mux_poll_task *vpt, *vptlast; + + dprintk(DEBUG_MUX, "mux %p muxnum %d procnum %d\n", m, v9fs_mux_num, + v9fs_mux_poll_task_num); + up(&v9fs_mux_task_lock); + + n = v9fs_mux_calc_poll_procs(v9fs_mux_num + 1); + if (n > v9fs_mux_poll_task_num) { + for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) { + if (v9fs_mux_poll_tasks[i].task == NULL) { + vpt = &v9fs_mux_poll_tasks[i]; + dprintk(DEBUG_MUX, "create proc %p\n", vpt); + vpt->task = kthread_create(v9fs_poll_proc, + vpt, "v9fs-poll"); + INIT_LIST_HEAD(&vpt->mux_list); + vpt->muxnum = 0; + v9fs_mux_poll_task_num++; + wake_up_process(vpt->task); + break; + } } - rd += ret; - ptr += ret; - } - return (rd); -} -/** - * read_message - read a full 9P2000 fcall packet - * @v9ses: session info structure - * @rcall: fcall structure to read into - * @rcalllen: size of fcall buffer - * - */ + if (i >= ARRAY_SIZE(v9fs_mux_poll_tasks)) + dprintk(DEBUG_ERROR, "warning: no free poll slots\n"); + } -static int -read_message(struct v9fs_session_info *v9ses, - struct v9fs_fcall *rcall, int rcalllen) -{ - unsigned char buf[4]; - void *data; - int size = 0; - int res = 0; - - res = xread(v9ses, buf, sizeof(buf)); - if (res < 0) { - dprintk(DEBUG_ERROR, - "Reading of count field failed returned: %d\n", res); - return res; + n = (v9fs_mux_num + 1) / v9fs_mux_poll_task_num + + ((v9fs_mux_num + 1) % v9fs_mux_poll_task_num ? 
1 : 0); + + vptlast = NULL; + for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) { + vpt = &v9fs_mux_poll_tasks[i]; + if (vpt->task != NULL) { + vptlast = vpt; + if (vpt->muxnum < n) { + dprintk(DEBUG_MUX, "put in proc %d\n", i); + list_add(&m->mux_list, &vpt->mux_list); + vpt->muxnum++; + m->poll_task = vpt; + memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); + init_poll_funcptr(&m->pt, v9fs_pollwait); + break; + } + } } - if (res < 4) { - dprintk(DEBUG_ERROR, - "Reading of count field failed returned: %d\n", res); - return -EIO; + if (i >= ARRAY_SIZE(v9fs_mux_poll_tasks)) { + dprintk(DEBUG_MUX, "put in proc %d\n", i); + list_add(&m->mux_list, &vptlast->mux_list); + vptlast->muxnum++; + m->poll_task = vpt; + memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); + init_poll_funcptr(&m->pt, v9fs_pollwait); } - size = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24); - dprintk(DEBUG_MUX, "got a packet count: %d\n", size); + v9fs_mux_num++; + down(&v9fs_mux_task_lock); +} - /* adjust for the four bytes of size */ - size -= 4; +static void v9fs_mux_poll_stop(struct v9fs_mux_data *m) +{ + int i; + struct v9fs_mux_poll_task *vpt; + + up(&v9fs_mux_task_lock); + vpt = m->poll_task; + list_del(&m->mux_list); + for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) { + if (m->poll_waddr[i] != NULL) { + remove_wait_queue(m->poll_waddr[i], &m->poll_wait[i]); + m->poll_waddr[i] = NULL; + } + } + vpt->muxnum--; + if (!vpt->muxnum) { + dprintk(DEBUG_MUX, "destroy proc %p\n", vpt); + send_sig(SIGKILL, vpt->task, 1); + vpt->task = NULL; + v9fs_mux_poll_task_num--; + } + v9fs_mux_num--; + down(&v9fs_mux_task_lock); +} - if (size > v9ses->maxdata) { - dprintk(DEBUG_ERROR, "packet too big: %d\n", size); - return -E2BIG; +/** + * v9fs_mux_init - allocate and initialize the per-session mux data + * Creates the polling task if this is the first session. 
+ * + * @trans - transport structure + * @msize - maximum message size + * @extended - pointer to the extended flag + */ +struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, + unsigned char *extended) +{ + int i, n; + struct v9fs_mux_data *m, *mtmp; + + dprintk(DEBUG_MUX, "transport %p msize %d\n", trans, msize); + m = kmalloc(sizeof(struct v9fs_mux_data) + 2 * msize, GFP_KERNEL); + if (!m) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&m->lock); + INIT_LIST_HEAD(&m->mux_list); + m->msize = msize; + m->extended = extended; + m->trans = trans; + idr_init(&m->tidpool.pool); + init_MUTEX(&m->tidpool.lock); + m->err = 0; + init_waitqueue_head(&m->equeue); + INIT_LIST_HEAD(&m->req_list); + INIT_LIST_HEAD(&m->unsent_req_list); + m->rpos = 0; + m->rbuf = (char *)m + sizeof(struct v9fs_mux_data); + m->wpos = m->wsize = 0; + m->wbuf = m->rbuf + msize; + INIT_WORK(&m->rq, v9fs_read_work, m); + INIT_WORK(&m->wq, v9fs_write_work, m); + m->wsched = 0; + memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); + v9fs_mux_poll_start(m); + + n = trans->poll(trans, &m->pt); + if (n & POLLIN) { + dprintk(DEBUG_MUX, "mux %p can read\n", m); + set_bit(Rpending, &m->wsched); } - data = kmalloc(size, GFP_KERNEL); - if (!data) { - eprintk(KERN_WARNING, "out of memory\n"); - return -ENOMEM; + if (n & POLLOUT) { + dprintk(DEBUG_MUX, "mux %p can write\n", m); + set_bit(Wpending, &m->wsched); } - res = xread(v9ses, data, size); - if (res < size) { - dprintk(DEBUG_ERROR, "Reading of fcall failed returned: %d\n", - res); - kfree(data); - return res; + for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) { + if (IS_ERR(m->poll_waddr[i])) { + v9fs_mux_poll_stop(m); + mtmp = (void *)m->poll_waddr; /* the error code */ + kfree(m); + m = mtmp; + break; + } } - /* we now have an in-memory string that is the reply. - * deserialize it. There is very little to go wrong at this point - * save for v9fs_alloc errors. 
- */ - res = v9fs_deserialize_fcall(v9ses, size, data, v9ses->maxdata, - rcall, rcalllen); + return m; +} - kfree(data); +/** + * v9fs_mux_destroy - cancels all pending requests and frees mux resources + */ +void v9fs_mux_destroy(struct v9fs_mux_data *m) +{ + dprintk(DEBUG_MUX, "mux %p prev %p next %p\n", m, + m->mux_list.prev, m->mux_list.next); + v9fs_mux_cancel(m, -ECONNRESET); + + if (!list_empty(&m->req_list)) { + /* wait until all processes waiting on this session exit */ + dprintk(DEBUG_MUX, "mux %p waiting for empty request queue\n", + m); + wait_event_timeout(m->equeue, (list_empty(&m->req_list)), 5000); + dprintk(DEBUG_MUX, "mux %p request queue empty: %d\n", m, + list_empty(&m->req_list)); + } - if (res < 0) - return res; + v9fs_mux_poll_stop(m); + m->trans = NULL; - return 0; + kfree(m); } /** - * v9fs_recv - receive an RPC response for a particular tag - * @v9ses: session info structure - * @req: RPC request structure - * + * v9fs_pollwait - called by files poll operation to add v9fs-poll task + * to files wait queue */ - -static int v9fs_recv(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req) +static void +v9fs_pollwait(struct file *filp, wait_queue_head_t * wait_address, + poll_table * p) { - int ret = 0; - - dprintk(DEBUG_MUX, "waiting for response: %d\n", req->tcall->tag); - ret = wait_event_interruptible(v9ses->read_wait, - ((v9ses->transport->status != Connected) || - (req->rcall != 0) || (req->err < 0) || - dprintcond(v9ses, req))); + int i; + struct v9fs_mux_data *m; - dprintk(DEBUG_MUX, "got it: rcall %p\n", req->rcall); + m = container_of(p, struct v9fs_mux_data, pt); + for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) + if (m->poll_waddr[i] == NULL) + break; - spin_lock(&v9ses->muxlock); - list_del(&req->next); - spin_unlock(&v9ses->muxlock); + if (i >= ARRAY_SIZE(m->poll_waddr)) { + dprintk(DEBUG_ERROR, "not enough wait_address slots\n"); + return; + } - if (req->err < 0) - return req->err; + m->poll_waddr[i] = wait_address; - if (v9ses->transport->status == Disconnected) - return -ECONNRESET; + if (!wait_address) { + dprintk(DEBUG_ERROR, "no wait_address\n"); + m->poll_waddr[i] = ERR_PTR(-EIO); + return; + } - return ret; + init_waitqueue_entry(&m->poll_wait[i], m->poll_task->task); + add_wait_queue(wait_address, &m->poll_wait[i]); } /** - * v9fs_send - send a 9P request - * @v9ses: session info structure - * @req: RPC request to send - * + * v9fs_poll_mux - polls a mux and schedules read or write works if necessary */ - -static int v9fs_send(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req) +static inline void v9fs_poll_mux(struct v9fs_mux_data *m) { - int ret = -1; - void *data = NULL; - struct v9fs_fcall *tcall = req->tcall; - - data = kmalloc(v9ses->maxdata + V9FS_IOHDRSZ, GFP_KERNEL); - if (!data) - return -ENOMEM; - - tcall->size = 0; /* enforce size recalculation */ - ret = - v9fs_serialize_fcall(v9ses, tcall, data, - v9ses->maxdata + V9FS_IOHDRSZ); - if (ret < 0) - goto free_data; - - spin_lock(&v9ses->muxlock); - list_add(&req->next, &v9ses->mux_fcalls); - spin_unlock(&v9ses->muxlock); - - dprintk(DEBUG_MUX, "sending message: tag %d size %d\n", tcall->tag, - tcall->size); - ret = v9ses->transport->write(v9ses->transport, data, tcall->size); - - if (ret != tcall->size) { - spin_lock(&v9ses->muxlock); - list_del(&req->next); - kfree(req->rcall); + int n; - spin_unlock(&v9ses->muxlock); - if (ret >= 0) - ret = -EREMOTEIO; - } else - ret = 0; + if (m->err < 0) + return; + + n = m->trans->poll(m->trans, NULL); + if (n < 0 || n & (POLLERR | POLLHUP 
| POLLNVAL)) { + dprintk(DEBUG_MUX, "error mux %p err %d\n", m, n); + if (n >= 0) + n = -ECONNRESET; + v9fs_mux_cancel(m, n); + } + + if (n & POLLIN) { + set_bit(Rpending, &m->wsched); + dprintk(DEBUG_MUX, "mux %p can read\n", m); + if (!test_and_set_bit(Rworksched, &m->wsched)) { + dprintk(DEBUG_MUX, "schedule read work mux %p\n", m); + queue_work(v9fs_mux_wq, &m->rq); + } + } - free_data: - kfree(data); - return ret; + if (n & POLLOUT) { + set_bit(Wpending, &m->wsched); + dprintk(DEBUG_MUX, "mux %p can write\n", m); + if ((m->wsize || !list_empty(&m->unsent_req_list)) + && !test_and_set_bit(Wworksched, &m->wsched)) { + dprintk(DEBUG_MUX, "schedule write work mux %p\n", m); + queue_work(v9fs_mux_wq, &m->wq); + } + } } /** - * v9fs_mux_rpc - send a request, receive a response - * @v9ses: session info structure - * @tcall: fcall to send - * @rcall: buffer to place response into - * + * v9fs_poll_proc - polls all v9fs transports for new events and queues + * the appropriate work to the work queue */ - -long -v9fs_mux_rpc(struct v9fs_session_info *v9ses, struct v9fs_fcall *tcall, - struct v9fs_fcall **rcall) +static int v9fs_poll_proc(void *a) { - int tid = -1; - struct v9fs_fcall *fcall = NULL; - struct v9fs_rpcreq req; - int ret = -1; + struct v9fs_mux_data *m, *mtmp; + struct v9fs_mux_poll_task *vpt; - if (!v9ses) - return -EINVAL; + vpt = a; + dprintk(DEBUG_MUX, "start %p %p\n", current, vpt); + allow_signal(SIGKILL); + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current)) + break; - if (!v9ses->transport || v9ses->transport->status != Connected) - return -EIO; + list_for_each_entry_safe(m, mtmp, &vpt->mux_list, mux_list) { + v9fs_poll_mux(m); + } + + dprintk(DEBUG_MUX, "sleeping...\n"); + schedule_timeout(SCHED_TIMEOUT * HZ); + } - if (rcall) - *rcall = NULL; + __set_current_state(TASK_RUNNING); + dprintk(DEBUG_MUX, "finish\n"); + return 0; +} - if (tcall->id != TVERSION) { - tid = v9fs_get_idpool(&v9ses->tidpool); - if (tid < 0) - return -ENOMEM; +static inline int v9fs_write_req(struct v9fs_mux_data *m, struct v9fs_req *req) +{ + int n; + + list_move_tail(&req->req_list, &m->req_list); + n = v9fs_serialize_fcall(req->tcall, m->wbuf, m->msize, *m->extended); + if (n < 0) { + req->err = n; + list_del(&req->req_list); + if (req->cb) { + spin_unlock(&m->lock); + (*req->cb) (req->cba, req->tcall, req->rcall, req->err); + req->cb = NULL; + spin_lock(&m->lock); + } else + kfree(req->rcall); + + kfree(req); } - tcall->tag = tid; + return n; +} - req.tcall = tcall; - req.err = 0; - req.rcall = NULL; +/** + * v9fs_write_work - called when a transport can send some data + */ +static void v9fs_write_work(void *a) +{ + int n, err; + struct v9fs_mux_data *m; + struct v9fs_req *req, *rtmp; - ret = v9fs_send(v9ses, &req); + m = a; - if (ret < 0) { - if (tcall->id != TVERSION) - v9fs_put_idpool(tid, &v9ses->tidpool); - dprintk(DEBUG_MUX, "error %d\n", ret); - return ret; + if (m->err < 0) { + clear_bit(Wworksched, &m->wsched); + return; } - ret = v9fs_recv(v9ses, &req); - - fcall = req.rcall; - - dprintk(DEBUG_MUX, "received: tag=%x, ret=%d\n", tcall->tag, ret); - if (ret == -ERESTARTSYS) { - if (v9ses->transport->status != Disconnected - && tcall->id != TFLUSH) { - unsigned long flags; - - dprintk(DEBUG_MUX, "flushing the tag: %d\n", - tcall->tag); - clear_thread_flag(TIF_SIGPENDING); - v9fs_t_flush(v9ses, tcall->tag); - spin_lock_irqsave(¤t->sighand->siglock, flags); - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, - flags); - 
dprintk(DEBUG_MUX, "flushing done\n"); + if (!m->wsize) { + if (list_empty(&m->unsent_req_list)) { + clear_bit(Wworksched, &m->wsched); + return; } - goto release_req; - } else if (ret < 0) - goto release_req; - - if (!fcall) - ret = -EIO; - else { - if (fcall->id == RERROR) { - ret = v9fs_errstr2errno(fcall->params.rerror.error); - if (ret == 0) { /* string match failed */ - if (fcall->params.rerror.errno) - ret = -(fcall->params.rerror.errno); - else - ret = -ESERVERFAULT; - } - } else if (fcall->id != tcall->id + 1) { - dprintk(DEBUG_ERROR, - "fcall mismatch: expected %d, got %d\n", - tcall->id + 1, fcall->id); - ret = -EIO; + err = 0; + spin_lock(&m->lock); + list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, + req_list) { + err = v9fs_write_req(m, req); + if (err > 0) + break; } + + m->wsize = err; + m->wpos = 0; + spin_unlock(&m->lock); } - release_req: - if (tcall->id != TVERSION) - v9fs_put_idpool(tid, &v9ses->tidpool); - if (rcall) - *rcall = fcall; - else - kfree(fcall); + dprintk(DEBUG_MUX, "mux %p pos %d size %d\n", m, m->wpos, m->wsize); + clear_bit(Wpending, &m->wsched); + err = m->trans->write(m->trans, m->wbuf + m->wpos, m->wsize - m->wpos); + dprintk(DEBUG_MUX, "mux %p sent %d bytes\n", m, err); + if (err == -EAGAIN) { + clear_bit(Wworksched, &m->wsched); + return; + } + + if (err <= 0) + goto error; + + m->wpos += err; + if (m->wpos == m->wsize) + m->wpos = m->wsize = 0; + + if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) { + if (test_and_clear_bit(Wpending, &m->wsched)) + n = POLLOUT; + else + n = m->trans->poll(m->trans, NULL); + + if (n & POLLOUT) { + dprintk(DEBUG_MUX, "schedule write work mux %p\n", m); + queue_work(v9fs_mux_wq, &m->wq); + } else + clear_bit(Wworksched, &m->wsched); + } else + clear_bit(Wworksched, &m->wsched); - return ret; + return; + + error: + v9fs_mux_cancel(m, err); + clear_bit(Wworksched, &m->wsched); } -/** - * v9fs_mux_cancel_requests - cancels all pending requests - * - * @v9ses: session info structure - * @err: error code to return to the requests - */ -void v9fs_mux_cancel_requests(struct v9fs_session_info *v9ses, int err) +static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) { - struct v9fs_rpcreq *rptr; - struct v9fs_rpcreq *rreq; + int ecode, tag; + char *ename; + + tag = req->tag; + if (req->rcall->id == RERROR && !req->err) { + ecode = req->rcall->params.rerror.errno; + ename = req->rcall->params.rerror.error; - dprintk(DEBUG_MUX, " %d\n", err); - spin_lock(&v9ses->muxlock); - list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) { - rreq->err = err; + dprintk(DEBUG_MUX, "Rerror %s\n", ename); + + if (*m->extended) + req->err = -ecode; + + if (!req->err) { + req->err = v9fs_errstr2errno(ename); + + if (!req->err) { /* string match failed */ + dprintk(DEBUG_ERROR, "unknown error: %s\n", + ename); + } + + if (!req->err) + req->err = -ESERVERFAULT; + } + } else if (req->tcall && req->rcall->id != req->tcall->id + 1) { + dprintk(DEBUG_ERROR, "fcall mismatch: expected %d, got %d\n", + req->tcall->id + 1, req->rcall->id); + if (!req->err) + req->err = -EIO; } - spin_unlock(&v9ses->muxlock); - wake_up_all(&v9ses->read_wait); + + if (req->cb && req->err != ERREQFLUSH) { + dprintk(DEBUG_MUX, "calling callback tcall %p rcall %p\n", + req->tcall, req->rcall); + + (*req->cb) (req->cba, req->tcall, req->rcall, req->err); + req->cb = NULL; + } else + kfree(req->rcall); + + if (tag != V9FS_NOTAG) + v9fs_put_idpool(tag, &m->tidpool); + + wake_up(&m->equeue); + kfree(req); } /** - * v9fs_recvproc - kproc to 
handle demultiplexing responses - * @data: session info structure - * + * v9fs_read_work - called when there is some data to be read from a transport */ - -static int v9fs_recvproc(void *data) +static void v9fs_read_work(void *a) { - struct v9fs_session_info *v9ses = (struct v9fs_session_info *)data; - struct v9fs_fcall *rcall = NULL; - struct v9fs_rpcreq *rptr; - struct v9fs_rpcreq *req; - struct v9fs_rpcreq *rreq; - int err = 0; + int n, err, rcallen; + struct v9fs_mux_data *m; + struct v9fs_req *req, *rptr, *rreq; + struct v9fs_fcall *rcall; + + m = a; + + if (m->err < 0) + return; + + rcall = NULL; + dprintk(DEBUG_MUX, "start mux %p pos %d\n", m, m->rpos); + clear_bit(Rpending, &m->wsched); + err = m->trans->read(m->trans, m->rbuf + m->rpos, m->msize - m->rpos); + dprintk(DEBUG_MUX, "mux %p got %d bytes\n", m, err); + if (err == -EAGAIN) { + clear_bit(Rworksched, &m->wsched); + return; + } - allow_signal(SIGKILL); - set_current_state(TASK_INTERRUPTIBLE); - complete(&v9ses->proccmpl); - while (!kthread_should_stop() && err >= 0) { - req = rptr = rreq = NULL; + if (err <= 0) + goto error; - rcall = kmalloc(v9ses->maxdata + V9FS_IOHDRSZ, GFP_KERNEL); - if (!rcall) { - eprintk(KERN_ERR, "no memory for buffers\n"); + m->rpos += err; + while (m->rpos > 4) { + n = le32_to_cpu(*(__le32 *) m->rbuf); + if (n >= m->msize) { + dprintk(DEBUG_ERROR, + "requested packet size too big: %d\n", n); + err = -EIO; + goto error; + } + + if (m->rpos < n) break; + + rcallen = n + V9FS_FCALLHDRSZ; + rcall = kmalloc(rcallen, GFP_KERNEL); + if (!rcall) { + err = -ENOMEM; + goto error; } - err = read_message(v9ses, rcall, v9ses->maxdata + V9FS_IOHDRSZ); - spin_lock(&v9ses->muxlock); + dump_data(m->rbuf, n); + err = v9fs_deserialize_fcall(m->rbuf, n, rcall, rcallen, + *m->extended); if (err < 0) { - list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) { - rreq->err = err; - } - if(err != -ERESTARTSYS) - eprintk(KERN_ERR, - "Transport error while reading message %d\n", err); - } else { - list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) { - if (rreq->tcall->tag == rcall->tag) { - req = rreq; - req->rcall = rcall; - break; - } - } + kfree(rcall); + goto error; } - if (req && (req->tcall->id == TFLUSH)) { - struct v9fs_rpcreq *treq = NULL; - list_for_each_entry_safe(treq, rptr, &v9ses->mux_fcalls, next) { - if (treq->tcall->tag == - req->tcall->params.tflush.oldtag) { - list_del(&rptr->next); - kfree(treq->rcall); - break; - } + dprintk(DEBUG_MUX, "mux %p fcall id %d tag %d\n", m, rcall->id, + rcall->tag); + + req = NULL; + spin_lock(&m->lock); + list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) { + if (rreq->tag == rcall->tag) { + req = rreq; + req->rcall = rcall; + list_del(&req->req_list); + spin_unlock(&m->lock); + process_request(m, req); + break; } } - spin_unlock(&v9ses->muxlock); - if (!req) { - if (err >= 0) + spin_unlock(&m->lock); + if (err >= 0 && rcall->id != RFLUSH) dprintk(DEBUG_ERROR, - "unexpected response: id %d tag %d\n", - rcall->id, rcall->tag); - + "unexpected response mux %p id %d tag %d\n", + m, rcall->id, rcall->tag); kfree(rcall); } - wake_up_all(&v9ses->read_wait); - set_current_state(TASK_INTERRUPTIBLE); + if (m->rpos > n) + memmove(m->rbuf, m->rbuf + n, m->rpos - n); + m->rpos -= n; } - v9ses->transport->close(v9ses->transport); - - /* Inform all pending processes about the failure */ - wake_up_all(&v9ses->read_wait); - - if (signal_pending(current)) - complete(&v9ses->proccmpl); + if (!list_empty(&m->req_list)) { + if (test_and_clear_bit(Rpending, 
&m->wsched)) + n = POLLIN; + else + n = m->trans->poll(m->trans, NULL); + + if (n & POLLIN) { + dprintk(DEBUG_MUX, "schedule read work mux %p\n", m); + queue_work(v9fs_mux_wq, &m->rq); + } else + clear_bit(Rworksched, &m->wsched); + } else + clear_bit(Rworksched, &m->wsched); - dprintk(DEBUG_MUX, "recvproc: end\n"); - v9ses->recvproc = NULL; + return; - return err >= 0; + error: + v9fs_mux_cancel(m, err); + clear_bit(Rworksched, &m->wsched); } /** - * v9fs_mux_init - initialize multiplexer (spawn kproc) - * @v9ses: session info structure - * @dev_name: mount device information (to create unique kproc) + * v9fs_send_request - send 9P request + * The function can sleep until the request is scheduled for sending. + * The function can be interrupted. Return from the function is not + * a guarantee that the request is sent succesfully. Can return errors + * that can be retrieved by PTR_ERR macros. * + * @m: mux data + * @tc: request to be sent + * @cb: callback function to call when response is received + * @cba: parameter to pass to the callback function */ +static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m, + struct v9fs_fcall *tc, + v9fs_mux_req_callback cb, void *cba) +{ + int n; + struct v9fs_req *req; + + dprintk(DEBUG_MUX, "mux %p task %p tcall %p id %d\n", m, current, + tc, tc->id); + if (m->err < 0) + return ERR_PTR(m->err); + + req = kmalloc(sizeof(struct v9fs_req), GFP_KERNEL); + if (!req) + return ERR_PTR(-ENOMEM); -int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name) + if (tc->id == TVERSION) + n = V9FS_NOTAG; + else + n = v9fs_get_idpool(&m->tidpool); + + if (n < 0) + return ERR_PTR(-ENOMEM); + + tc->tag = n; + req->tag = n; + req->tcall = tc; + req->rcall = NULL; + req->err = 0; + req->cb = cb; + req->cba = cba; + + spin_lock(&m->lock); + list_add_tail(&req->req_list, &m->unsent_req_list); + spin_unlock(&m->lock); + + if (test_and_clear_bit(Wpending, &m->wsched)) + n = POLLOUT; + else + n = m->trans->poll(m->trans, NULL); + + if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched)) + queue_work(v9fs_mux_wq, &m->wq); + + return req; +} + +static inline void +v9fs_mux_flush_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, + int err) { - char procname[60]; - - strncpy(procname, dev_name, sizeof(procname)); - procname[sizeof(procname) - 1] = 0; - - init_waitqueue_head(&v9ses->read_wait); - init_completion(&v9ses->fcread); - init_completion(&v9ses->proccmpl); - spin_lock_init(&v9ses->muxlock); - INIT_LIST_HEAD(&v9ses->mux_fcalls); - v9ses->recvproc = NULL; - v9ses->curfcall = NULL; - - v9ses->recvproc = kthread_create(v9fs_recvproc, v9ses, - "v9fs_recvproc %s", procname); - - if (IS_ERR(v9ses->recvproc)) { - eprintk(KERN_ERR, "cannot create receiving thread\n"); - v9fs_session_close(v9ses); - return -ECONNABORTED; + v9fs_mux_req_callback cb; + int tag; + struct v9fs_mux_data *m; + struct v9fs_req *req, *rptr; + + m = a; + dprintk(DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m, tc, + rc, err, tc->params.tflush.oldtag); + + spin_lock(&m->lock); + cb = NULL; + tag = tc->params.tflush.oldtag; + list_for_each_entry_safe(req, rptr, &m->req_list, req_list) { + if (req->tag == tag) { + list_del(&req->req_list); + if (req->cb) { + cb = req->cb; + req->cb = NULL; + spin_unlock(&m->lock); + (*cb) (req->cba, req->tcall, req->rcall, + req->err); + } + kfree(req); + wake_up(&m->equeue); + break; + } + } + + if (!cb) + spin_unlock(&m->lock); + + if (v9fs_check_idpool(tag, &m->tidpool)) + v9fs_put_idpool(tag, &m->tidpool); + + kfree(tc); + 
kfree(rc); +} + +static void +v9fs_mux_flush_request(struct v9fs_mux_data *m, struct v9fs_req *req) +{ + struct v9fs_fcall *fc; + + dprintk(DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag); + + fc = kmalloc(sizeof(struct v9fs_fcall), GFP_KERNEL); + fc->id = TFLUSH; + fc->params.tflush.oldtag = req->tag; + + v9fs_send_request(m, fc, v9fs_mux_flush_cb, m); +} + +static void +v9fs_mux_rpc_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err) +{ + struct v9fs_mux_rpc *r; + + if (err == ERREQFLUSH) { + dprintk(DEBUG_MUX, "err req flush\n"); + return; + } + + r = a; + dprintk(DEBUG_MUX, "mux %p req %p tc %p rc %p err %d\n", r->m, r->req, + tc, rc, err); + r->rcall = rc; + r->err = err; + wake_up(&r->wqueue); +} + +/** + * v9fs_mux_rpc - sends 9P request and waits until a response is available. + * The function can be interrupted. + * @m: mux data + * @tc: request to be sent + * @rc: pointer where a pointer to the response is stored + */ +int +v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc, + struct v9fs_fcall **rc) +{ + int err; + unsigned long flags; + struct v9fs_req *req; + struct v9fs_mux_rpc r; + + r.err = 0; + r.rcall = NULL; + r.m = m; + init_waitqueue_head(&r.wqueue); + + if (rc) + *rc = NULL; + + req = v9fs_send_request(m, tc, v9fs_mux_rpc_cb, &r); + if (IS_ERR(req)) { + err = PTR_ERR(req); + dprintk(DEBUG_MUX, "error %d\n", err); + return PTR_ERR(req); + } + + r.req = req; + dprintk(DEBUG_MUX, "mux %p tc %p tag %d rpc %p req %p\n", m, tc, + req->tag, &r, req); + err = wait_event_interruptible(r.wqueue, r.rcall != NULL || r.err < 0); + if (r.err < 0) + err = r.err; + + if (err == -ERESTARTSYS && m->trans->status == Connected && m->err == 0) { + spin_lock(&m->lock); + req->tcall = NULL; + req->err = ERREQFLUSH; + spin_unlock(&m->lock); + + clear_thread_flag(TIF_SIGPENDING); + v9fs_mux_flush_request(m, req); + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - wake_up_process(v9ses->recvproc); - wait_for_completion(&v9ses->proccmpl); + if (!err) { + if (r.rcall) + dprintk(DEBUG_MUX, "got response id %d tag %d\n", + r.rcall->id, r.rcall->tag); + + if (rc) + *rc = r.rcall; + else + kfree(r.rcall); + } else { + kfree(r.rcall); + dprintk(DEBUG_MUX, "got error %d\n", err); + if (err > 0) + err = -EIO; + } + + return err; +} + +/** + * v9fs_mux_rpcnb - sends 9P request without waiting for response. 
+ * @m: mux data + * @tc: request to be sent + * @cb: callback function to be called when response arrives + * @cba: value to pass to the callback function + */ +int v9fs_mux_rpcnb(struct v9fs_mux_data *m, struct v9fs_fcall *tc, + v9fs_mux_req_callback cb, void *a) +{ + int err; + struct v9fs_req *req; + + req = v9fs_send_request(m, tc, cb, a); + if (IS_ERR(req)) { + err = PTR_ERR(req); + dprintk(DEBUG_MUX, "error %d\n", err); + return PTR_ERR(req); + } + dprintk(DEBUG_MUX, "mux %p tc %p tag %d\n", m, tc, req->tag); return 0; } + +/** + * v9fs_mux_cancel - cancel all pending requests with error + * @m: mux data + * @err: error code + */ +void v9fs_mux_cancel(struct v9fs_mux_data *m, int err) +{ + struct v9fs_req *req, *rtmp; + LIST_HEAD(cancel_list); + + dprintk(DEBUG_MUX, "mux %p err %d\n", m, err); + m->err = err; + spin_lock(&m->lock); + list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { + list_move(&req->req_list, &cancel_list); + } + spin_unlock(&m->lock); + + list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { + list_del(&req->req_list); + if (!req->err) + req->err = err; + + if (req->cb) + (*req->cb) (req->cba, req->tcall, req->rcall, req->err); + else + kfree(req->rcall); + + kfree(req); + } + + wake_up(&m->equeue); +} diff --git a/fs/9p/mux.h b/fs/9p/mux.h index 4994cb10ba..02b13b14b0 100644 --- a/fs/9p/mux.h +++ b/fs/9p/mux.h @@ -3,6 +3,7 @@ * * Multiplexer Definitions * + * Copyright (C) 2005 by Latchesar Ionkov * Copyright (C) 2004 by Eric Van Hensbergen * * This program is free software; you can redistribute it and/or modify @@ -23,19 +24,34 @@ * */ -/* structure to manage each RPC transaction */ +struct v9fs_mux_data; -struct v9fs_rpcreq { - struct v9fs_fcall *tcall; - struct v9fs_fcall *rcall; - int err; /* error code if response failed */ +/** + * v9fs_mux_req_callback - callback function that is called when the + * response of a request is received. The callback is called from + * a workqueue and shouldn't block. + * + * @a - the pointer that was specified when the request was send to be + * passed to the callback + * @tc - request call + * @rc - response call + * @err - error code (non-zero if error occured) + */ +typedef void (*v9fs_mux_req_callback)(void *a, struct v9fs_fcall *tc, + struct v9fs_fcall *rc, int err); + +void v9fs_mux_global_init(void); +void v9fs_mux_global_exit(void); - /* XXX - could we put scatter/gather buffers here? 
*/ +struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, + unsigned char *extended); +void v9fs_mux_destroy(struct v9fs_mux_data *); - struct list_head next; -}; +int v9fs_mux_send(struct v9fs_mux_data *m, struct v9fs_fcall *tc); +struct v9fs_fcall *v9fs_mux_recv(struct v9fs_mux_data *m); +int v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc, struct v9fs_fcall **rc); +int v9fs_mux_rpcnb(struct v9fs_mux_data *m, struct v9fs_fcall *tc, + v9fs_mux_req_callback cb, void *a); -int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name); -long v9fs_mux_rpc(struct v9fs_session_info *v9ses, - struct v9fs_fcall *tcall, struct v9fs_fcall **rcall); -void v9fs_mux_cancel_requests(struct v9fs_session_info *v9ses, int err); +void v9fs_mux_flush(struct v9fs_mux_data *m, int sendflush); +void v9fs_mux_cancel(struct v9fs_mux_data *m, int err); diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c index b7ffb98595..1a28ef97a3 100644 --- a/fs/9p/trans_fd.c +++ b/fs/9p/trans_fd.c @@ -3,6 +3,7 @@ * * File Descriptor Transport Layer * + * Copyright (C) 2005 by Latchesar Ionkov * Copyright (C) 2005 by Eric Van Hensbergen * * This program is free software; you can redistribute it and/or modify @@ -106,9 +107,6 @@ v9fs_fd_init(struct v9fs_session_info *v9ses, const char *addr, char *data) return -ENOPROTOOPT; } - sema_init(&trans->writelock, 1); - sema_init(&trans->readlock, 1); - ts = kmalloc(sizeof(struct v9fs_trans_fd), GFP_KERNEL); if (!ts) @@ -163,10 +161,55 @@ static void v9fs_fd_close(struct v9fs_transport *trans) kfree(ts); } +static unsigned int +v9fs_fd_poll(struct v9fs_transport *trans, struct poll_table_struct *pt) +{ + int ret, n; + struct v9fs_trans_fd *ts; + mm_segment_t oldfs; + + if (!trans) + return -EIO; + + ts = trans->priv; + if (trans->status != Connected || !ts) + return -EIO; + + oldfs = get_fs(); + set_fs(get_ds()); + + if (!ts->in_file->f_op || !ts->in_file->f_op->poll) { + ret = -EIO; + goto end; + } + + ret = ts->in_file->f_op->poll(ts->in_file, pt); + + if (ts->out_file != ts->in_file) { + if (!ts->out_file->f_op || !ts->out_file->f_op->poll) { + ret = -EIO; + goto end; + } + + n = ts->out_file->f_op->poll(ts->out_file, pt); + + ret &= ~POLLOUT; + n &= ~POLLIN; + + ret |= n; + } + +end: + set_fs(oldfs); + return ret; +} + + struct v9fs_transport v9fs_trans_fd = { .init = v9fs_fd_init, .write = v9fs_fd_send, .read = v9fs_fd_recv, .close = v9fs_fd_close, + .poll = v9fs_fd_poll, }; diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c index 6a9a75d40f..9ef404c75c 100644 --- a/fs/9p/trans_sock.c +++ b/fs/9p/trans_sock.c @@ -3,6 +3,7 @@ * * Socket Transport Layer * + * Copyright (C) 2004-2005 by Latchesar Ionkov * Copyright (C) 2004 by Eric Van Hensbergen * Copyright (C) 1997-2002 by Ron Minnich * Copyright (C) 1995, 1996 by Olaf Kirch @@ -36,6 +37,7 @@ #include #include #include +#include #include "debug.h" #include "v9fs.h" @@ -45,6 +47,7 @@ struct v9fs_trans_sock { struct socket *s; + struct file *filp; }; /** @@ -57,41 +60,26 @@ struct v9fs_trans_sock { static int v9fs_sock_recv(struct v9fs_transport *trans, void *v, int len) { - struct msghdr msg; - struct kvec iov; - int result; - mm_segment_t oldfs; - struct v9fs_trans_sock *ts = trans ? 
trans->priv : NULL; + int ret; + struct v9fs_trans_sock *ts; - if (trans->status == Disconnected) + if (!trans || trans->status == Disconnected) { + dprintk(DEBUG_ERROR, "disconnected ...\n"); return -EREMOTEIO; + } - result = -EINVAL; - - oldfs = get_fs(); - set_fs(get_ds()); - - iov.iov_base = v; - iov.iov_len = len; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_iovlen = 1; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_flags = MSG_NOSIGNAL; - - result = kernel_recvmsg(ts->s, &msg, &iov, 1, len, 0); + ts = trans->priv; - dprintk(DEBUG_TRANS, "socket state %d\n", ts->s->state); - set_fs(oldfs); + if (!(ts->filp->f_flags & O_NONBLOCK)) + dprintk(DEBUG_ERROR, "blocking read ...\n"); - if (result <= 0) { - if (result != -ERESTARTSYS) + ret = kernel_read(ts->filp, ts->filp->f_pos, v, len); + if (ret <= 0) { + if (ret != -ERESTARTSYS && ret != -EAGAIN) trans->status = Disconnected; } - return result; + return ret; } /** @@ -104,40 +92,73 @@ static int v9fs_sock_recv(struct v9fs_transport *trans, void *v, int len) static int v9fs_sock_send(struct v9fs_transport *trans, void *v, int len) { - struct kvec iov; - struct msghdr msg; - int result = -1; + int ret; mm_segment_t oldfs; - struct v9fs_trans_sock *ts = trans ? trans->priv : NULL; + struct v9fs_trans_sock *ts; - dprintk(DEBUG_TRANS, "Sending packet size %d (%x)\n", len, len); - dump_data(v, len); + if (!trans || trans->status == Disconnected) { + dprintk(DEBUG_ERROR, "disconnected ...\n"); + return -EREMOTEIO; + } + + ts = trans->priv; + if (!ts) { + dprintk(DEBUG_ERROR, "no transport ...\n"); + return -EREMOTEIO; + } - down(&trans->writelock); + if (!(ts->filp->f_flags & O_NONBLOCK)) + dprintk(DEBUG_ERROR, "blocking write ...\n"); + dump_data(v, len); oldfs = get_fs(); set_fs(get_ds()); - iov.iov_base = v; - iov.iov_len = len; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_iovlen = 1; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_flags = MSG_NOSIGNAL; - result = kernel_sendmsg(ts->s, &msg, &iov, 1, len); + ret = vfs_write(ts->filp, (void __user *)v, len, &ts->filp->f_pos); set_fs(oldfs); - if (result < 0) { - if (result != -ERESTARTSYS) + if (ret < 0) { + if (ret != -ERESTARTSYS) trans->status = Disconnected; } - up(&trans->writelock); - return result; + return ret; +} + +static unsigned int v9fs_sock_poll(struct v9fs_transport *trans, + struct poll_table_struct *pt) { + + int ret; + struct v9fs_trans_sock *ts; + mm_segment_t oldfs; + + if (!trans) { + dprintk(DEBUG_ERROR, "no transport\n"); + return -EIO; + } + + ts = trans->priv; + if (trans->status != Connected || !ts) { + dprintk(DEBUG_ERROR, "transport disconnected: %d\n", trans->status); + return -EIO; + } + + oldfs = get_fs(); + set_fs(get_ds()); + + if (!ts->filp->f_op || !ts->filp->f_op->poll) { + dprintk(DEBUG_ERROR, "no poll operation\n"); + ret = -EIO; + goto end; + } + + ret = ts->filp->f_op->poll(ts->filp, pt); + +end: + set_fs(oldfs); + return ret; } + /** * v9fs_tcp_init - initialize TCP socket * @v9ses: session information @@ -154,9 +175,9 @@ v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data) int rc = 0; struct v9fs_trans_sock *ts = NULL; struct v9fs_transport *trans = v9ses->transport; + int fd; - sema_init(&trans->writelock, 1); - sema_init(&trans->readlock, 1); + trans->status = Disconnected; ts = kmalloc(sizeof(struct v9fs_trans_sock), GFP_KERNEL); @@ -165,6 +186,7 @@ v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data) 
trans->priv = ts; ts->s = NULL; + ts->filp = NULL; if (!addr) return -EINVAL; @@ -185,7 +207,18 @@ v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data) return rc; } csocket->sk->sk_allocation = GFP_NOIO; + + fd = sock_map_fd(csocket); + if (fd < 0) { + sock_release(csocket); + kfree(ts); + trans->priv = NULL; + return fd; + } + ts->s = csocket; + ts->filp = fget(fd); + ts->filp->f_flags |= O_NONBLOCK; trans->status = Connected; return 0; @@ -203,7 +236,7 @@ static int v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name, char *data) { - int rc; + int rc, fd; struct socket *csocket; struct sockaddr_un sun_server; struct v9fs_transport *trans; @@ -213,6 +246,8 @@ v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name, csocket = NULL; trans = v9ses->transport; + trans->status = Disconnected; + if (strlen(dev_name) > UNIX_PATH_MAX) { eprintk(KERN_ERR, "v9fs_trans_unix: address too long: %s\n", dev_name); @@ -225,9 +260,7 @@ v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name, trans->priv = ts; ts->s = NULL; - - sema_init(&trans->writelock, 1); - sema_init(&trans->readlock, 1); + ts->filp = NULL; sun_server.sun_family = PF_UNIX; strcpy(sun_server.sun_path, dev_name); @@ -241,7 +274,18 @@ v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name, return rc; } csocket->sk->sk_allocation = GFP_NOIO; + + fd = sock_map_fd(csocket); + if (fd < 0) { + sock_release(csocket); + kfree(ts); + trans->priv = NULL; + return fd; + } + ts->s = csocket; + ts->filp = fget(fd); + ts->filp->f_flags |= O_NONBLOCK; trans->status = Connected; return 0; @@ -262,12 +306,11 @@ static void v9fs_sock_close(struct v9fs_transport *trans) ts = trans->priv; - if ((ts) && (ts->s)) { - dprintk(DEBUG_TRANS, "closing the socket %p\n", ts->s); - sock_release(ts->s); + if ((ts) && (ts->filp)) { + fput(ts->filp); + ts->filp = NULL; ts->s = NULL; trans->status = Disconnected; - dprintk(DEBUG_TRANS, "socket closed\n"); } kfree(ts); @@ -280,6 +323,7 @@ struct v9fs_transport v9fs_trans_tcp = { .write = v9fs_sock_send, .read = v9fs_sock_recv, .close = v9fs_sock_close, + .poll = v9fs_sock_poll, }; struct v9fs_transport v9fs_trans_unix = { @@ -287,4 +331,5 @@ struct v9fs_transport v9fs_trans_unix = { .write = v9fs_sock_send, .read = v9fs_sock_recv, .close = v9fs_sock_close, + .poll = v9fs_sock_poll, }; diff --git a/fs/9p/transport.h b/fs/9p/transport.h index 9e9cd418ef..91fcdb94b3 100644 --- a/fs/9p/transport.h +++ b/fs/9p/transport.h @@ -3,6 +3,7 @@ * * Transport Definition * + * Copyright (C) 2005 by Latchesar Ionkov * Copyright (C) 2004 by Eric Van Hensbergen * * This program is free software; you can redistribute it and/or modify @@ -31,14 +32,13 @@ enum v9fs_transport_status { struct v9fs_transport { enum v9fs_transport_status status; - struct semaphore writelock; - struct semaphore readlock; void *priv; int (*init) (struct v9fs_session_info *, const char *, char *); int (*write) (struct v9fs_transport *, void *, int); int (*read) (struct v9fs_transport *, void *, int); void (*close) (struct v9fs_transport *); + unsigned int (*poll)(struct v9fs_transport *, struct poll_table_struct *); }; extern struct v9fs_transport v9fs_trans_tcp; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 418c3743fd..5e0f79355f 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -213,7 +213,8 @@ retry: return -1; } - error = idr_get_new(&p->pool, NULL, &i); + /* no need to store exactly p, we just need something non-null */ + error = idr_get_new(&p->pool, p, &i); up(&p->lock); if 
(error == -EAGAIN) @@ -242,6 +243,16 @@ void v9fs_put_idpool(int id, struct v9fs_idpool *p) up(&p->lock); } +/** + * v9fs_check_idpool - check if the specified id is available + * @id - id to check + * @p - pool + */ +int v9fs_check_idpool(int id, struct v9fs_idpool *p) +{ + return idr_find(&p->pool, id) != NULL; +} + /** * v9fs_session_init - initialize session * @v9ses: session information structure @@ -281,9 +292,6 @@ v9fs_session_init(struct v9fs_session_info *v9ses, /* id pools that are session-dependent: FIDs and TIDs */ idr_init(&v9ses->fidpool.pool); init_MUTEX(&v9ses->fidpool.lock); - idr_init(&v9ses->tidpool.pool); - init_MUTEX(&v9ses->tidpool.lock); - switch (v9ses->proto) { case PROTO_TCP: @@ -320,7 +328,12 @@ v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->shutdown = 0; v9ses->session_hung = 0; - if ((retval = v9fs_mux_init(v9ses, dev_name)) < 0) { + v9ses->mux = v9fs_mux_init(v9ses->transport, v9ses->maxdata + V9FS_IOHDRSZ, + &v9ses->extended); + + if (IS_ERR(v9ses->mux)) { + retval = PTR_ERR(v9ses->mux); + v9ses->mux = NULL; dprintk(DEBUG_ERROR, "problem initializing mux\n"); goto SessCleanUp; } @@ -381,7 +394,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses, } if (v9ses->afid != ~0) { - if (v9fs_t_clunk(v9ses, v9ses->afid, NULL)) + if (v9fs_t_clunk(v9ses, v9ses->afid)) dprintk(DEBUG_ERROR, "clunk failed\n"); } @@ -403,13 +416,16 @@ v9fs_session_init(struct v9fs_session_info *v9ses, void v9fs_session_close(struct v9fs_session_info *v9ses) { - if (v9ses->recvproc) { - send_sig(SIGKILL, v9ses->recvproc, 1); - wait_for_completion(&v9ses->proccmpl); + if (v9ses->mux) { + v9fs_mux_destroy(v9ses->mux); + v9ses->mux = NULL; } - if (v9ses->transport) + if (v9ses->transport) { v9ses->transport->close(v9ses->transport); + kfree(v9ses->transport); + v9ses->transport = NULL; + } __putname(v9ses->name); __putname(v9ses->remotename); @@ -420,8 +436,9 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) * and cancel all pending requests. */ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { + dprintk(DEBUG_ERROR, "cancel session %p\n", v9ses); v9ses->transport->status = Disconnected; - v9fs_mux_cancel_requests(v9ses, -EIO); + v9fs_mux_cancel(v9ses->mux, -EIO); } extern int v9fs_error_init(void); @@ -437,6 +454,7 @@ static int __init init_v9fs(void) printk(KERN_INFO "Installing v9fs 9P2000 file system support\n"); + v9fs_mux_global_init(); return register_filesystem(&v9fs_fs_type); } @@ -447,6 +465,7 @@ static int __init init_v9fs(void) static void __exit exit_v9fs(void) { + v9fs_mux_global_exit(); unregister_filesystem(&v9fs_fs_type); } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 45dcef42bd..f337da7a0e 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -57,24 +57,14 @@ struct v9fs_session_info { /* book keeping */ struct v9fs_idpool fidpool; /* The FID pool for file descriptors */ - struct v9fs_idpool tidpool; /* The TID pool for transactions ids */ - /* transport information */ struct v9fs_transport *transport; + struct v9fs_mux_data *mux; int inprogress; /* session in progress => true */ int shutdown; /* session shutting down. no more attaches. 
*/ unsigned char session_hung; - - /* mux private data */ - struct v9fs_fcall *curfcall; - wait_queue_head_t read_wait; - struct completion fcread; - struct completion proccmpl; - struct task_struct *recvproc; - - spinlock_t muxlock; - struct list_head mux_fcalls; + struct dentry *debugfs_dir; }; /* possible values of ->proto */ @@ -84,11 +74,14 @@ enum { PROTO_FD, }; +extern struct dentry *v9fs_debugfs_root; + int v9fs_session_init(struct v9fs_session_info *, const char *, char *); struct v9fs_session_info *v9fs_inode2v9ses(struct inode *); void v9fs_session_close(struct v9fs_session_info *v9ses); int v9fs_get_idpool(struct v9fs_idpool *p); void v9fs_put_idpool(int id, struct v9fs_idpool *p); +int v9fs_check_idpool(int id, struct v9fs_idpool *p); void v9fs_session_cancel(struct v9fs_session_info *v9ses); #define V9FS_MAGIC 0x01021997 diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index a6aa947de0..4887df7673 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -95,24 +95,21 @@ static int v9fs_dentry_validate(struct dentry *dentry, struct nameidata *nd) void v9fs_dentry_release(struct dentry *dentry) { + int err; + dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); if (dentry->d_fsdata != NULL) { struct list_head *fid_list = dentry->d_fsdata; struct v9fs_fid *temp = NULL; struct v9fs_fid *current_fid = NULL; - struct v9fs_fcall *fcall = NULL; list_for_each_entry_safe(current_fid, temp, fid_list, list) { - if (v9fs_t_clunk - (current_fid->v9ses, current_fid->fid, &fcall)) - dprintk(DEBUG_ERROR, "clunk failed: %s\n", - FCALL_ERROR(fcall)); + err = v9fs_t_clunk(current_fid->v9ses, current_fid->fid); - v9fs_put_idpool(current_fid->fid, - ¤t_fid->v9ses->fidpool); + if (err < 0) + dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); - kfree(fcall); v9fs_fid_destroy(current_fid); } diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 17089d1905..3893dd307d 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -74,7 +74,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *inode = filp->f_dentry->d_inode; struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); struct v9fs_fid *file = filp->private_data; - unsigned int i, n; + unsigned int i, n, s; int fid = -1; int ret = 0; struct v9fs_stat *mi = NULL; @@ -97,9 +97,9 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) n = file->rdir_fcall->params.rread.count; i = file->rdir_fpos; while (i < n) { - int s = v9fs_deserialize_stat(v9ses, - file->rdir_fcall->params.rread.data + i, - n - i, mi, v9ses->maxdata); + s = v9fs_deserialize_stat( + file->rdir_fcall->params.rread.data + i, + n - i, mi, v9ses->maxdata, v9ses->extended); if (s == 0) { dprintk(DEBUG_ERROR, @@ -141,9 +141,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) n = ret; i = 0; while (i < n) { - int s = v9fs_deserialize_stat(v9ses, - fcall->params.rread.data + i, n - i, mi, - v9ses->maxdata); + s = v9fs_deserialize_stat(fcall->params.rread.data + i, + n - i, mi, v9ses->maxdata, v9ses->extended); if (s == 0) { dprintk(DEBUG_ERROR, @@ -199,11 +198,9 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) dprintk(DEBUG_VFS, "fidopen: %d v9f->fid: %d\n", fid->fidopen, fid->fid); - if (v9fs_t_clunk(v9ses, fidnum, NULL)) + if (v9fs_t_clunk(v9ses, fidnum)) dprintk(DEBUG_ERROR, "clunk failed\n"); - v9fs_put_idpool(fid->fid, &v9ses->fidpool); - kfree(fid->rdir_fcall); kfree(fid); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 
0ea965c3bb..466002a1fe 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -318,6 +318,7 @@ v9fs_create(struct inode *dir, int result = 0; unsigned int iounit = 0; int wfidno = -1; + int err; perm = unixmode2p9mode(v9ses, perm); @@ -356,6 +357,7 @@ v9fs_create(struct inode *dir, } kfree(fcall); + fcall = NULL; result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name, perm, open_mode, &fcall); @@ -369,16 +371,23 @@ v9fs_create(struct inode *dir, iounit = fcall->params.rcreate.iounit; qid = fcall->params.rcreate.qid; kfree(fcall); + fcall = NULL; - fid = v9fs_fid_create(file_dentry, v9ses, newfid, 1); - dprintk(DEBUG_VFS, "fid %p %d\n", fid, fid->fidcreate); - if (!fid) { - result = -ENOMEM; - goto CleanUpFid; - } + if (!(perm&V9FS_DMDIR)) { + fid = v9fs_fid_create(file_dentry, v9ses, newfid, 1); + dprintk(DEBUG_VFS, "fid %p %d\n", fid, fid->fidcreate); + if (!fid) { + result = -ENOMEM; + goto CleanUpFid; + } - fid->qid = qid; - fid->iounit = iounit; + fid->qid = qid; + fid->iounit = iounit; + } else { + err = v9fs_t_clunk(v9ses, newfid); + if (err < 0) + dprintk(DEBUG_ERROR, "clunk for mkdir failed: %d\n", err); + } /* walk to the newly created file and put the fid in the dentry */ wfidno = v9fs_get_idpool(&v9ses->fidpool); @@ -388,18 +397,19 @@ v9fs_create(struct inode *dir, } result = v9fs_t_walk(v9ses, dirfidnum, wfidno, - (char *) file_dentry->d_name.name, NULL); + (char *) file_dentry->d_name.name, &fcall); if (result < 0) { dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall)); v9fs_put_idpool(wfidno, &v9ses->fidpool); wfidno = -1; goto CleanUpFid; } + kfree(fcall); + fcall = NULL; if (!v9fs_fid_create(file_dentry, v9ses, wfidno, 0)) { - if (!v9fs_t_clunk(v9ses, newfid, &fcall)) { - v9fs_put_idpool(wfidno, &v9ses->fidpool); - } + v9fs_t_clunk(v9ses, newfid); + v9fs_put_idpool(wfidno, &v9ses->fidpool); goto CleanUpFid; } @@ -431,40 +441,21 @@ v9fs_create(struct inode *dir, file_dentry->d_op = &v9fs_dentry_operations; d_instantiate(file_dentry, file_inode); - if (perm & V9FS_DMDIR) { - if (!v9fs_t_clunk(v9ses, newfid, &fcall)) - v9fs_put_idpool(newfid, &v9ses->fidpool); - else - dprintk(DEBUG_ERROR, "clunk for mkdir failed: %s\n", - FCALL_ERROR(fcall)); - kfree(fcall); - fid->fidopen = 0; - fid->fidcreate = 0; - d_drop(file_dentry); - } - return 0; CleanUpFid: kfree(fcall); + fcall = NULL; if (newfid >= 0) { - if (!v9fs_t_clunk(v9ses, newfid, &fcall)) - v9fs_put_idpool(newfid, &v9ses->fidpool); - else - dprintk(DEBUG_ERROR, "clunk failed: %s\n", - FCALL_ERROR(fcall)); - - kfree(fcall); + err = v9fs_t_clunk(v9ses, newfid); + if (err < 0) + dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); } if (wfidno >= 0) { - if (!v9fs_t_clunk(v9ses, wfidno, &fcall)) - v9fs_put_idpool(wfidno, &v9ses->fidpool); - else - dprintk(DEBUG_ERROR, "clunk failed: %s\n", - FCALL_ERROR(fcall)); - - kfree(fcall); + err = v9fs_t_clunk(v9ses, wfidno); + if (err < 0) + dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); } return result; } @@ -972,6 +963,7 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); struct v9fs_fcall *fcall = NULL; struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); + int err; dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, symname); @@ -1004,9 +996,9 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) kfree(fcall); - if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n", 
- FCALL_ERROR(fcall)); + err = v9fs_t_clunk(v9ses, newfid->fid); + if (err < 0) { + dprintk(DEBUG_ERROR, "clunk for symlink failed: %d\n", err); goto FreeFcall; } @@ -1180,6 +1172,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry); struct v9fs_fid *newfid = NULL; char *symname = __getname(); + int err; dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, old_dentry->d_name.name); @@ -1216,9 +1209,10 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, kfree(fcall); - if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n", - FCALL_ERROR(fcall)); + err = v9fs_t_clunk(v9ses, newfid->fid); + + if (err < 0) { + dprintk(DEBUG_ERROR, "clunk for symlink failed: %d\n", err); goto FreeMem; } @@ -1252,6 +1246,7 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) struct v9fs_fcall *fcall = NULL; struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); char *symname = __getname(); + int err; dprintk(DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); @@ -1310,9 +1305,9 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) /* need to update dcache so we show up */ kfree(fcall); - if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n", - FCALL_ERROR(fcall)); + err = v9fs_t_clunk(v9ses, newfid->fid); + if (err < 0) { + dprintk(DEBUG_ERROR, "clunk for symlink failed: %d\n", err); goto FreeMem; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 82c5b00840..83b6edd098 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -129,6 +129,7 @@ static struct super_block *v9fs_get_sb(struct file_system_type if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); + kfree(v9ses); return ERR_PTR(newfid); } @@ -157,7 +158,7 @@ static struct super_block *v9fs_get_sb(struct file_system_type stat_result = v9fs_t_stat(v9ses, newfid, &fcall); if (stat_result < 0) { dprintk(DEBUG_ERROR, "stat error\n"); - v9fs_t_clunk(v9ses, newfid, NULL); + v9fs_t_clunk(v9ses, newfid); v9fs_put_idpool(newfid, &v9ses->fidpool); } else { /* Setup the Root Inode */ -- cgit v1.2.2 From d8da097afb765654c866062148fd98b11db9003e Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Sun, 8 Jan 2006 01:04:59 -0800 Subject: [PATCH] v9fs: fix fid management in v9fs_create v9fs_create doesn't manage the fids correctly when it is called to create a directory. The fid created by the create 9P call (newfid) and the one created by walking to the already-created file (wfidno) are not used consistently. This patch cleans up the usage of newfid and wfidno, as sketched below.
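In outline, the corrected flow is roughly the following (an illustrative sketch only, not the literal patched function; it reuses the newfid/wfidno variables and the v9fs_t_* helpers already present in v9fs_create, and omits the error unwinding):

	/* Directory case: newfid is only needed for the Tcreate call,
	 * so clunk it immediately and forget about it. */
	if (perm & V9FS_DMDIR) {
		err = v9fs_t_clunk(v9ses, newfid);
		newfid = -1;
		if (err < 0)
			dprintk(DEBUG_ERROR, "clunk for mkdir failed: %d\n", err);
	}

	/* Get a fresh fid by walking from the parent to the new file ... */
	wfidno = v9fs_get_idpool(&v9ses->fidpool);
	if (wfidno < 0)		/* check wfidno here, not the stale newfid */
		return -ENOSPC;

	result = v9fs_t_walk(v9ses, dirfidnum, wfidno,
			     (char *) file_dentry->d_name.name, &fcall);

	/* ... and use wfidno, not newfid, for the follow-up stat. */
	result = v9fs_t_stat(v9ses, wfidno, &fcall);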
Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 466002a1fe..f11edde643 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -385,13 +385,14 @@ v9fs_create(struct inode *dir, fid->iounit = iounit; } else { err = v9fs_t_clunk(v9ses, newfid); + newfid = -1; if (err < 0) dprintk(DEBUG_ERROR, "clunk for mkdir failed: %d\n", err); } /* walk to the newly created file and put the fid in the dentry */ wfidno = v9fs_get_idpool(&v9ses->fidpool); - if (newfid < 0) { + if (wfidno < 0) { eprintk(KERN_WARNING, "no free fids available\n"); return -ENOSPC; } @@ -408,7 +409,6 @@ v9fs_create(struct inode *dir, fcall = NULL; if (!v9fs_fid_create(file_dentry, v9ses, wfidno, 0)) { - v9fs_t_clunk(v9ses, newfid); v9fs_put_idpool(wfidno, &v9ses->fidpool); goto CleanUpFid; @@ -419,7 +419,7 @@ v9fs_create(struct inode *dir, (perm & V9FS_DMDEVICE)) return 0; - result = v9fs_t_stat(v9ses, newfid, &fcall); + result = v9fs_t_stat(v9ses, wfidno, &fcall); if (result < 0) { dprintk(DEBUG_ERROR, "stat error: %s(%d)\n", FCALL_ERROR(fcall), result); -- cgit v1.2.2 From 531b1094b74365dcc55fa464d28a9a2497ae825d Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Sun, 8 Jan 2006 01:05:00 -0800 Subject: [PATCH] v9fs: zero copy implementation Performance enhancement reducing the number of copies in the data and stat paths. Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/9p.c | 302 ++++++++++-------- fs/9p/9p.h | 75 +++-- fs/9p/Makefile | 10 +- fs/9p/conv.c | 895 +++++++++++++++++++++++++++++++---------------------- fs/9p/conv.h | 28 +- fs/9p/debug.h | 23 +- fs/9p/error.c | 10 +- fs/9p/error.h | 3 +- fs/9p/fid.c | 3 - fs/9p/mux.c | 157 +++++----- fs/9p/trans_sock.c | 1 - fs/9p/v9fs.c | 3 +- fs/9p/v9fs_vfs.h | 5 +- fs/9p/vfs_dentry.c | 4 +- fs/9p/vfs_dir.c | 31 +- fs/9p/vfs_file.c | 25 +- fs/9p/vfs_inode.c | 545 +++++++++++--------------------- fs/9p/vfs_super.c | 10 +- 18 files changed, 1083 insertions(+), 1047 deletions(-) diff --git a/fs/9p/9p.c b/fs/9p/9p.c index a3a1ac6107..dc3ce44ec8 100644 --- a/fs/9p/9p.c +++ b/fs/9p/9p.c @@ -1,8 +1,9 @@ /* * linux/fs/9p/9p.c * - * This file contains functions 9P2000 functions + * This file contains functions to perform synchronous 9P calls * + * Copyright (C) 2004 by Latchesar Ionkov * Copyright (C) 2004 by Eric Van Hensbergen * Copyright (C) 2002 by Ron Minnich * @@ -33,6 +34,7 @@ #include "debug.h" #include "v9fs.h" #include "9p.h" +#include "conv.h" #include "mux.h" /** @@ -46,17 +48,21 @@ int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, - char *version, struct v9fs_fcall **fcall) + char *version, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; dprintk(DEBUG_9P, "msize: %d version: %s\n", msize, version); - msg.id = TVERSION; - msg.tag = ~0; - msg.params.tversion.msize = msize; - msg.params.tversion.version = version; + tc = v9fs_create_tversion(msize, version); - return v9fs_mux_rpc(v9ses->mux, &msg, fcall); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -72,19 +78,23 @@ v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, int v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, - u32 fid, u32 afid, struct v9fs_fcall **fcall) + u32 fid, u32 
afid, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall* tc; dprintk(DEBUG_9P, "uname '%s' aname '%s' fid %d afid %d\n", uname, aname, fid, afid); - msg.id = TATTACH; - msg.params.tattach.fid = fid; - msg.params.tattach.afid = afid; - msg.params.tattach.uname = uname; - msg.params.tattach.aname = aname; - return v9fs_mux_rpc(v9ses->mux, &msg, fcall); + ret = -ENOMEM; + tc = v9fs_create_tattach(fid, afid, uname, aname); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc, @@ -117,24 +127,28 @@ static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc, * @fcall: pointer to response fcall pointer * */ + int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid) { - int err; + int ret; struct v9fs_fcall *tc, *rc; - tc = kmalloc(sizeof(struct v9fs_fcall), GFP_KERNEL); - dprintk(DEBUG_9P, "fid %d\n", fid); - tc->id = TCLUNK; - tc->params.tclunk.fid = fid; - err = v9fs_mux_rpc(v9ses->mux, tc, &rc); - if (err >= 0) { - v9fs_t_clunk_cb(v9ses, tc, rc, 0); - } + ret = -ENOMEM; + rc = NULL; + tc = v9fs_create_tclunk(fid); + if (!IS_ERR(tc)) + ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); + else + ret = PTR_ERR(tc); + + if (ret) + dprintk(DEBUG_ERROR, "failed fid %d err %d\n", fid, ret); - return err; + v9fs_t_clunk_cb(v9ses, tc, rc, ret); + return ret; } /** @@ -144,14 +158,22 @@ v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid) * */ -int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 tag) +int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; - dprintk(DEBUG_9P, "oldtag %d\n", tag); - msg.id = TFLUSH; - msg.params.tflush.oldtag = tag; - return v9fs_mux_rpc(v9ses->mux, &msg, NULL); + dprintk(DEBUG_9P, "oldtag %d\n", oldtag); + + ret = -ENOMEM; + tc = v9fs_create_tflush(oldtag); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, NULL); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -163,17 +185,22 @@ int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 tag) */ int -v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **fcall) +v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; dprintk(DEBUG_9P, "fid %d\n", fid); - if (fcall) - *fcall = NULL; - msg.id = TSTAT; - msg.params.tstat.fid = fid; - return v9fs_mux_rpc(v9ses->mux, &msg, fcall); + ret = -ENOMEM; + tc = v9fs_create_tstat(fid); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -187,16 +214,22 @@ v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **fcall) int v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_stat *stat, struct v9fs_fcall **fcall) + struct v9fs_wstat *wstat, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; - dprintk(DEBUG_9P, "fid %d length %d\n", fid, (int)stat->length); - msg.id = TWSTAT; - msg.params.twstat.fid = fid; - msg.params.twstat.stat = stat; + dprintk(DEBUG_9P, "fid %d\n", fid); + + ret = -ENOMEM; + tc = v9fs_create_twstat(fid, wstat, v9ses->extended); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); - return v9fs_mux_rpc(v9ses->mux, &msg, fcall); + return ret; } /** @@ -213,23 +246,28 @@ v9fs_t_wstat(struct v9fs_session_info *v9ses, 
u32 fid, int v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid, - char *name, struct v9fs_fcall **fcall) + char *name, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; + int nwname; dprintk(DEBUG_9P, "fid %d newfid %d wname '%s'\n", fid, newfid, name); - msg.id = TWALK; - msg.params.twalk.fid = fid; - msg.params.twalk.newfid = newfid; - - if (name) { - msg.params.twalk.nwname = 1; - msg.params.twalk.wnames = &name; - } else { - msg.params.twalk.nwname = 0; - } - - return v9fs_mux_rpc(v9ses->mux, &msg, fcall); + + if (name) + nwname = 1; + else + nwname = 0; + + ret = -ENOMEM; + tc = v9fs_create_twalk(fid, newfid, nwname, &name); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -244,19 +282,22 @@ v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid, int v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode, - struct v9fs_fcall **fcall) + struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; - int errorno = -1; + int ret; + struct v9fs_fcall *tc; dprintk(DEBUG_9P, "fid %d mode %d\n", fid, mode); - msg.id = TOPEN; - msg.params.topen.fid = fid; - msg.params.topen.mode = mode; - errorno = v9fs_mux_rpc(v9ses->mux, &msg, fcall); + ret = -ENOMEM; + tc = v9fs_create_topen(fid, mode); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); - return errorno; + return ret; } /** @@ -269,14 +310,22 @@ v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode, int v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_fcall **fcall) + struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; dprintk(DEBUG_9P, "fid %d\n", fid); - msg.id = TREMOVE; - msg.params.tremove.fid = fid; - return v9fs_mux_rpc(v9ses->mux, &msg, fcall); + + ret = -ENOMEM; + tc = v9fs_create_tremove(fid); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -292,20 +341,23 @@ v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid, int v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name, - u32 perm, u8 mode, struct v9fs_fcall **fcall) + u32 perm, u8 mode, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; dprintk(DEBUG_9P, "fid %d name '%s' perm %x mode %d\n", fid, name, perm, mode); - msg.id = TCREATE; - msg.params.tcreate.fid = fid; - msg.params.tcreate.name = name; - msg.params.tcreate.perm = perm; - msg.params.tcreate.mode = mode; + ret = -ENOMEM; + tc = v9fs_create_tcreate(fid, name, perm, mode); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); - return v9fs_mux_rpc(v9ses->mux, &msg, fcall); + return ret; } /** @@ -320,31 +372,30 @@ v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name, int v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset, - u32 count, struct v9fs_fcall **fcall) + u32 count, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; - struct v9fs_fcall *rc = NULL; - long errorno = -1; - - dprintk(DEBUG_9P, "fid %d offset 0x%lx count 0x%x\n", fid, - (long unsigned int)offset, count); - msg.id = TREAD; - msg.params.tread.fid = fid; - msg.params.tread.offset = offset; - msg.params.tread.count = count; - errorno = v9fs_mux_rpc(v9ses->mux, &msg, &rc); - - if (!errorno) { - errorno = rc->params.rread.count; - dump_data(rc->params.rread.data, 
rc->params.rread.count); - } - - if (fcall) - *fcall = rc; - else - kfree(rc); + int ret; + struct v9fs_fcall *tc, *rc; - return errorno; + dprintk(DEBUG_9P, "fid %d offset 0x%llux count 0x%x\n", fid, + (long long unsigned) offset, count); + + ret = -ENOMEM; + tc = v9fs_create_tread(fid, offset, count); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); + if (!ret) + ret = rc->params.rread.count; + if (rcp) + *rcp = rc; + else + kfree(rc); + + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -358,32 +409,31 @@ v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset, */ int -v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, - u64 offset, u32 count, void *data, struct v9fs_fcall **fcall) +v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset, u32 count, + const char __user *data, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; - struct v9fs_fcall *rc = NULL; - long errorno = -1; + int ret; + struct v9fs_fcall *tc, *rc; - dprintk(DEBUG_9P, "fid %d offset 0x%llx count 0x%x\n", fid, - (unsigned long long)offset, count); - dump_data(data, count); + dprintk(DEBUG_9P, "fid %d offset 0x%llux count 0x%x\n", fid, + (long long unsigned) offset, count); - msg.id = TWRITE; - msg.params.twrite.fid = fid; - msg.params.twrite.offset = offset; - msg.params.twrite.count = count; - msg.params.twrite.data = data; + ret = -ENOMEM; + tc = v9fs_create_twrite(fid, offset, count, data); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); - errorno = v9fs_mux_rpc(v9ses->mux, &msg, &rc); + if (!ret) + ret = rc->params.rwrite.count; + if (rcp) + *rcp = rc; + else + kfree(rc); - if (!errorno) - errorno = rc->params.rwrite.count; + kfree(tc); + } else + ret = PTR_ERR(tc); - if (fcall) - *fcall = rc; - else - kfree(rc); - - return errorno; + return ret; } + diff --git a/fs/9p/9p.h b/fs/9p/9p.h index 6355392786..007ff63977 100644 --- a/fs/9p/9p.h +++ b/fs/9p/9p.h @@ -3,6 +3,7 @@ * * 9P protocol definitions. 
* + * Copyright (C) 2005 by Latchesar Ionkov * Copyright (C) 2004 by Eric Van Hensbergen * Copyright (C) 2002 by Ron Minnich * @@ -102,10 +103,16 @@ enum { #define V9FS_NOTAG (u16)(~0) #define V9FS_NOFID (u32)(~0) +#define V9FS_MAXWELEM 16 /* ample room for Twrite/Rread header (iounit) */ #define V9FS_IOHDRSZ 24 +struct v9fs_str { + u16 len; + char *str; +}; + /* qids are the unique ID for a file (like an inode */ struct v9fs_qid { u8 type; @@ -115,6 +122,29 @@ struct v9fs_qid { /* Plan 9 file metadata (stat) structure */ struct v9fs_stat { + u16 size; + u16 type; + u32 dev; + struct v9fs_qid qid; + u32 mode; + u32 atime; + u32 mtime; + u64 length; + struct v9fs_str name; + struct v9fs_str uid; + struct v9fs_str gid; + struct v9fs_str muid; + struct v9fs_str extension; /* 9p2000.u extensions */ + u32 n_uid; /* 9p2000.u extensions */ + u32 n_gid; /* 9p2000.u extensions */ + u32 n_muid; /* 9p2000.u extensions */ +}; + +/* file metadata (stat) structure used to create Twstat message + The is similar to v9fs_stat, but the strings don't point to + the same memory block and should be freed separately +*/ +struct v9fs_wstat { u16 size; u16 type; u32 dev; @@ -131,25 +161,24 @@ struct v9fs_stat { u32 n_uid; /* 9p2000.u extensions */ u32 n_gid; /* 9p2000.u extensions */ u32 n_muid; /* 9p2000.u extensions */ - char data[0]; }; /* Structures for Protocol Operations */ struct Tversion { u32 msize; - char *version; + struct v9fs_str version; }; struct Rversion { u32 msize; - char *version; + struct v9fs_str version; }; struct Tauth { u32 afid; - char *uname; - char *aname; + struct v9fs_str uname; + struct v9fs_str aname; }; struct Rauth { @@ -157,12 +186,12 @@ struct Rauth { }; struct Rerror { - char *error; + struct v9fs_str error; u32 errno; /* 9p2000.u extension */ }; struct Tflush { - u32 oldtag; + u16 oldtag; }; struct Rflush { @@ -171,8 +200,8 @@ struct Rflush { struct Tattach { u32 fid; u32 afid; - char *uname; - char *aname; + struct v9fs_str uname; + struct v9fs_str aname; }; struct Rattach { @@ -182,13 +211,13 @@ struct Rattach { struct Twalk { u32 fid; u32 newfid; - u32 nwname; - char **wnames; + u16 nwname; + struct v9fs_str wnames[16]; }; struct Rwalk { - u32 nwqid; - struct v9fs_qid *wqids; + u16 nwqid; + struct v9fs_qid wqids[16]; }; struct Topen { @@ -203,7 +232,7 @@ struct Ropen { struct Tcreate { u32 fid; - char *name; + struct v9fs_str name; u32 perm; u8 mode; }; @@ -254,12 +283,12 @@ struct Tstat { }; struct Rstat { - struct v9fs_stat *stat; + struct v9fs_stat stat; }; struct Twstat { u32 fid; - struct v9fs_stat *stat; + struct v9fs_stat stat; }; struct Rwstat { @@ -274,6 +303,7 @@ struct v9fs_fcall { u32 size; u8 id; u16 tag; + void *sdata; union { struct Tversion tversion; @@ -306,10 +336,12 @@ struct v9fs_fcall { } params; }; -#define V9FS_FCALLHDRSZ (sizeof(struct v9fs_fcall) + \ - sizeof(struct v9fs_stat) + 16*sizeof(struct v9fs_qid) + 16) +#define PRINT_FCALL_ERROR(s, fcall) dprintk(DEBUG_ERROR, "%s: %.*s\n", s, \ + fcall?fcall->params.rerror.error.len:0, \ + fcall?fcall->params.rerror.error.str:""); -#define FCALL_ERROR(fcall) (fcall ? 
fcall->params.rerror.error : "") +char *v9fs_str_copy(char *buf, int buflen, struct v9fs_str *str); +int v9fs_str_compare(char *buf, struct v9fs_str *str); int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, char *version, struct v9fs_fcall **rcall); @@ -325,7 +357,7 @@ int v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **rcall); int v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_stat *stat, struct v9fs_fcall **rcall); + struct v9fs_wstat *wstat, struct v9fs_fcall **rcall); int v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid, char *name, struct v9fs_fcall **rcall); @@ -343,4 +375,5 @@ int v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset, u32 count, struct v9fs_fcall **rcall); int v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset, - u32 count, void *data, struct v9fs_fcall **rcall); + u32 count, const char __user * data, + struct v9fs_fcall **rcall); diff --git a/fs/9p/Makefile b/fs/9p/Makefile index e4e4ffe5a7..3d02308970 100644 --- a/fs/9p/Makefile +++ b/fs/9p/Makefile @@ -1,17 +1,17 @@ obj-$(CONFIG_9P_FS) := 9p2000.o 9p2000-objs := \ + trans_fd.o \ + trans_sock.o \ + mux.o \ + 9p.o \ + conv.o \ vfs_super.o \ vfs_inode.o \ vfs_file.o \ vfs_dir.o \ vfs_dentry.o \ error.o \ - mux.o \ - trans_fd.o \ - trans_sock.o \ - 9p.o \ - conv.o \ v9fs.o \ fid.o diff --git a/fs/9p/conv.c b/fs/9p/conv.c index 1b9b15dfea..f62434d435 100644 --- a/fs/9p/conv.c +++ b/fs/9p/conv.c @@ -30,7 +30,7 @@ #include #include #include - +#include #include "debug.h" #include "v9fs.h" #include "9p.h" @@ -45,6 +45,37 @@ struct cbuf { unsigned char *ep; }; +char *v9fs_str_copy(char *buf, int buflen, struct v9fs_str *str) +{ + int n; + + if (buflen < str->len) + n = buflen; + else + n = str->len; + + memmove(buf, str->str, n - 1); + + return buf; +} + +int v9fs_str_compare(char *buf, struct v9fs_str *str) +{ + int n, ret; + + ret = strncmp(buf, str->str, str->len); + + if (!ret) { + n = strlen(buf); + if (n < str->len) + ret = -1; + else if (n > str->len) + ret = 1; + } + + return ret; +} + static inline void buf_init(struct cbuf *buf, void *data, int datalen) { buf->sp = buf->p = data; @@ -58,12 +89,12 @@ static inline int buf_check_overflow(struct cbuf *buf) static inline int buf_check_size(struct cbuf *buf, int len) { - if (buf->p+len > buf->ep) { - if (buf->p < buf->ep) { - eprintk(KERN_ERR, "buffer overflow\n"); - buf->p = buf->ep + 1; - return 0; - } + if (buf->p + len > buf->ep && buf->p < buf->ep) { + eprintk(KERN_ERR, "buffer overflow: want %d has %d\n", + len, (int)(buf->ep - buf->p)); + dump_stack(); + buf->p = buf->ep + 1; + return 0; } return 1; @@ -127,14 +158,6 @@ static inline void buf_put_string(struct cbuf *buf, const char *s) buf_put_stringn(buf, s, strlen(s)); } -static inline void buf_put_data(struct cbuf *buf, void *data, u32 datalen) -{ - if (buf_check_size(buf, datalen)) { - memcpy(buf->p, data, datalen); - buf->p += datalen; - } -} - static inline u8 buf_get_int8(struct cbuf *buf) { u8 ret = 0; @@ -183,85 +206,37 @@ static inline u64 buf_get_int64(struct cbuf *buf) return ret; } -static inline int -buf_get_string(struct cbuf *buf, char *data, unsigned int datalen) -{ - u16 len = 0; - - len = buf_get_int16(buf); - if (!buf_check_overflow(buf) && buf_check_size(buf, len) && len+1>datalen) { - memcpy(data, buf->p, len); - data[len] = 0; - buf->p += len; - len++; - } - - return len; -} - -static inline char *buf_get_stringb(struct cbuf *buf, struct cbuf *sbuf) -{ - char *ret; - u16 len; - - ret = 
NULL; - len = buf_get_int16(buf); - - if (!buf_check_overflow(buf) && buf_check_size(buf, len) && - buf_check_size(sbuf, len + 1)) { - - memcpy(sbuf->p, buf->p, len); - sbuf->p[len] = 0; - ret = sbuf->p; - buf->p += len; - sbuf->p += len + 1; - } - - return ret; -} - -static inline int buf_get_data(struct cbuf *buf, void *data, int datalen) +static inline void buf_get_str(struct cbuf *buf, struct v9fs_str *vstr) { - int ret = 0; - - if (buf_check_size(buf, datalen)) { - memcpy(data, buf->p, datalen); - buf->p += datalen; - ret = datalen; + vstr->len = buf_get_int16(buf); + if (!buf_check_overflow(buf) && buf_check_size(buf, vstr->len)) { + vstr->str = buf->p; + buf->p += vstr->len; + } else { + vstr->len = 0; + vstr->str = NULL; } - - return ret; } -static inline void *buf_get_datab(struct cbuf *buf, struct cbuf *dbuf, - int datalen) +static inline void buf_get_qid(struct cbuf *bufp, struct v9fs_qid *qid) { - char *ret = NULL; - int n = 0; - - if (buf_check_size(dbuf, datalen)) { - n = buf_get_data(buf, dbuf->p, datalen); - if (n > 0) { - ret = dbuf->p; - dbuf->p += n; - } - } - - return ret; + qid->type = buf_get_int8(bufp); + qid->version = buf_get_int32(bufp); + qid->path = buf_get_int64(bufp); } /** - * v9fs_size_stat - calculate the size of a variable length stat struct + * v9fs_size_wstat - calculate the size of a variable length stat struct * @stat: metadata (stat) structure * @extended: non-zero if 9P2000.u * */ -static int v9fs_size_stat(struct v9fs_stat *stat, int extended) +static int v9fs_size_wstat(struct v9fs_wstat *wstat, int extended) { int size = 0; - if (stat == NULL) { + if (wstat == NULL) { eprintk(KERN_ERR, "v9fs_size_stat: got a NULL stat pointer\n"); return 0; } @@ -278,81 +253,38 @@ static int v9fs_size_stat(struct v9fs_stat *stat, int extended) 8 + /* length[8] */ 8; /* minimum sum of string lengths */ - if (stat->name) - size += strlen(stat->name); - if (stat->uid) - size += strlen(stat->uid); - if (stat->gid) - size += strlen(stat->gid); - if (stat->muid) - size += strlen(stat->muid); + if (wstat->name) + size += strlen(wstat->name); + if (wstat->uid) + size += strlen(wstat->uid); + if (wstat->gid) + size += strlen(wstat->gid); + if (wstat->muid) + size += strlen(wstat->muid); if (extended) { size += 4 + /* n_uid[4] */ 4 + /* n_gid[4] */ 4 + /* n_muid[4] */ 2; /* string length of extension[4] */ - if (stat->extension) - size += strlen(stat->extension); + if (wstat->extension) + size += strlen(wstat->extension); } return size; } /** - * serialize_stat - safely format a stat structure for transmission - * @stat: metadata (stat) structure - * @bufp: buffer to serialize structure into - * @extended: non-zero if 9P2000.u - * - */ - -static int -serialize_stat(struct v9fs_stat *stat, struct cbuf *bufp, int extended) -{ - buf_put_int16(bufp, stat->size); - buf_put_int16(bufp, stat->type); - buf_put_int32(bufp, stat->dev); - buf_put_int8(bufp, stat->qid.type); - buf_put_int32(bufp, stat->qid.version); - buf_put_int64(bufp, stat->qid.path); - buf_put_int32(bufp, stat->mode); - buf_put_int32(bufp, stat->atime); - buf_put_int32(bufp, stat->mtime); - buf_put_int64(bufp, stat->length); - - buf_put_string(bufp, stat->name); - buf_put_string(bufp, stat->uid); - buf_put_string(bufp, stat->gid); - buf_put_string(bufp, stat->muid); - - if (extended) { - buf_put_string(bufp, stat->extension); - buf_put_int32(bufp, stat->n_uid); - buf_put_int32(bufp, stat->n_gid); - buf_put_int32(bufp, stat->n_muid); - } - - if (buf_check_overflow(bufp)) - return 0; - - return stat->size; -} - -/** 
- * deserialize_stat - safely decode a recieved metadata (stat) structure + * buf_get_stat - safely decode a recieved metadata (stat) structure * @bufp: buffer to deserialize * @stat: metadata (stat) structure - * @dbufp: buffer to deserialize variable strings into * @extended: non-zero if 9P2000.u * */ -static inline int -deserialize_stat(struct cbuf *bufp, struct v9fs_stat *stat, - struct cbuf *dbufp, int extended) +static inline void +buf_get_stat(struct cbuf *bufp, struct v9fs_stat *stat, int extended) { - stat->size = buf_get_int16(bufp); stat->type = buf_get_int16(bufp); stat->dev = buf_get_int32(bufp); @@ -363,45 +295,17 @@ deserialize_stat(struct cbuf *bufp, struct v9fs_stat *stat, stat->atime = buf_get_int32(bufp); stat->mtime = buf_get_int32(bufp); stat->length = buf_get_int64(bufp); - stat->name = buf_get_stringb(bufp, dbufp); - stat->uid = buf_get_stringb(bufp, dbufp); - stat->gid = buf_get_stringb(bufp, dbufp); - stat->muid = buf_get_stringb(bufp, dbufp); + buf_get_str(bufp, &stat->name); + buf_get_str(bufp, &stat->uid); + buf_get_str(bufp, &stat->gid); + buf_get_str(bufp, &stat->muid); if (extended) { - stat->extension = buf_get_stringb(bufp, dbufp); + buf_get_str(bufp, &stat->extension); stat->n_uid = buf_get_int32(bufp); stat->n_gid = buf_get_int32(bufp); stat->n_muid = buf_get_int32(bufp); } - - if (buf_check_overflow(bufp) || buf_check_overflow(dbufp)) - return 0; - - return stat->size + 2; -} - -/** - * deserialize_statb - wrapper for decoding a received metadata structure - * @bufp: buffer to deserialize - * @dbufp: buffer to deserialize variable strings into - * @extended: non-zero if 9P2000.u - * - */ - -static inline struct v9fs_stat *deserialize_statb(struct cbuf *bufp, - struct cbuf *dbufp, - int extended) -{ - struct v9fs_stat *ret = buf_alloc(dbufp, sizeof(struct v9fs_stat)); - - if (ret) { - int n = deserialize_stat(bufp, ret, dbufp, extended); - if (n <= 0) - return NULL; - } - - return ret; } /** @@ -409,194 +313,27 @@ static inline struct v9fs_stat *deserialize_statb(struct cbuf *bufp, * @buf: buffer to deserialize * @buflen: length of received buffer * @stat: metadata structure to decode into - * @statlen: length of destination metadata structure * @extended: non-zero if 9P2000.u * + * Note: stat will point to the buf region. 
*/ -int v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat, - u32 statlen, int extended) +int +v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat, + int extended) { struct cbuf buffer; struct cbuf *bufp = &buffer; - struct cbuf dbuffer; - struct cbuf *dbufp = &dbuffer; + unsigned char *p; buf_init(bufp, buf, buflen); - buf_init(dbufp, (char *)stat + sizeof(struct v9fs_stat), - statlen - sizeof(struct v9fs_stat)); - - return deserialize_stat(bufp, stat, dbufp, extended); -} - -static inline int v9fs_size_fcall(struct v9fs_fcall *fcall, int extended) -{ - int size = 4 + 1 + 2; /* size[4] msg[1] tag[2] */ - int i = 0; + p = bufp->p; + buf_get_stat(bufp, stat, extended); - switch (fcall->id) { - default: - eprintk(KERN_ERR, "bad msg type %d\n", fcall->id); + if (buf_check_overflow(bufp)) return 0; - case TVERSION: /* msize[4] version[s] */ - size += 4 + 2 + strlen(fcall->params.tversion.version); - break; - case TAUTH: /* afid[4] uname[s] aname[s] */ - size += 4 + 2 + strlen(fcall->params.tauth.uname) + - 2 + strlen(fcall->params.tauth.aname); - break; - case TFLUSH: /* oldtag[2] */ - size += 2; - break; - case TATTACH: /* fid[4] afid[4] uname[s] aname[s] */ - size += 4 + 4 + 2 + strlen(fcall->params.tattach.uname) + - 2 + strlen(fcall->params.tattach.aname); - break; - case TWALK: /* fid[4] newfid[4] nwname[2] nwname*(wname[s]) */ - size += 4 + 4 + 2; - /* now compute total for the array of names */ - for (i = 0; i < fcall->params.twalk.nwname; i++) - size += 2 + strlen(fcall->params.twalk.wnames[i]); - break; - case TOPEN: /* fid[4] mode[1] */ - size += 4 + 1; - break; - case TCREATE: /* fid[4] name[s] perm[4] mode[1] */ - size += 4 + 2 + strlen(fcall->params.tcreate.name) + 4 + 1; - break; - case TREAD: /* fid[4] offset[8] count[4] */ - size += 4 + 8 + 4; - break; - case TWRITE: /* fid[4] offset[8] count[4] data[count] */ - size += 4 + 8 + 4 + fcall->params.twrite.count; - break; - case TCLUNK: /* fid[4] */ - size += 4; - break; - case TREMOVE: /* fid[4] */ - size += 4; - break; - case TSTAT: /* fid[4] */ - size += 4; - break; - case TWSTAT: /* fid[4] stat[n] */ - fcall->params.twstat.stat->size = - v9fs_size_stat(fcall->params.twstat.stat, extended); - size += 4 + 2 + 2 + fcall->params.twstat.stat->size; - } - return size; -} - -/* - * v9fs_serialize_fcall - marshall fcall struct into a packet - * @fcall: structure to convert - * @data: buffer to serialize fcall into - * @datalen: length of buffer to serialize fcall into - * @extended: non-zero if 9P2000.u - * - */ - -int -v9fs_serialize_fcall(struct v9fs_fcall *fcall, void *data, u32 datalen, - int extended) -{ - int i = 0; - struct v9fs_stat *stat = NULL; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - buf_init(bufp, data, datalen); - - if (!fcall) { - eprintk(KERN_ERR, "no fcall\n"); - return -EINVAL; - } - - fcall->size = v9fs_size_fcall(fcall, extended); - - buf_put_int32(bufp, fcall->size); - buf_put_int8(bufp, fcall->id); - buf_put_int16(bufp, fcall->tag); - - dprintk(DEBUG_CONV, "size %d id %d tag %d\n", fcall->size, fcall->id, - fcall->tag); - - /* now encode it */ - switch (fcall->id) { - default: - eprintk(KERN_ERR, "bad msg type: %d\n", fcall->id); - return -EPROTO; - case TVERSION: - buf_put_int32(bufp, fcall->params.tversion.msize); - buf_put_string(bufp, fcall->params.tversion.version); - break; - case TAUTH: - buf_put_int32(bufp, fcall->params.tauth.afid); - buf_put_string(bufp, fcall->params.tauth.uname); - buf_put_string(bufp, fcall->params.tauth.aname); - break; - case TFLUSH: - 
buf_put_int16(bufp, fcall->params.tflush.oldtag); - break; - case TATTACH: - buf_put_int32(bufp, fcall->params.tattach.fid); - buf_put_int32(bufp, fcall->params.tattach.afid); - buf_put_string(bufp, fcall->params.tattach.uname); - buf_put_string(bufp, fcall->params.tattach.aname); - break; - case TWALK: - buf_put_int32(bufp, fcall->params.twalk.fid); - buf_put_int32(bufp, fcall->params.twalk.newfid); - buf_put_int16(bufp, fcall->params.twalk.nwname); - for (i = 0; i < fcall->params.twalk.nwname; i++) - buf_put_string(bufp, fcall->params.twalk.wnames[i]); - break; - case TOPEN: - buf_put_int32(bufp, fcall->params.topen.fid); - buf_put_int8(bufp, fcall->params.topen.mode); - break; - case TCREATE: - buf_put_int32(bufp, fcall->params.tcreate.fid); - buf_put_string(bufp, fcall->params.tcreate.name); - buf_put_int32(bufp, fcall->params.tcreate.perm); - buf_put_int8(bufp, fcall->params.tcreate.mode); - break; - case TREAD: - buf_put_int32(bufp, fcall->params.tread.fid); - buf_put_int64(bufp, fcall->params.tread.offset); - buf_put_int32(bufp, fcall->params.tread.count); - break; - case TWRITE: - buf_put_int32(bufp, fcall->params.twrite.fid); - buf_put_int64(bufp, fcall->params.twrite.offset); - buf_put_int32(bufp, fcall->params.twrite.count); - buf_put_data(bufp, fcall->params.twrite.data, - fcall->params.twrite.count); - break; - case TCLUNK: - buf_put_int32(bufp, fcall->params.tclunk.fid); - break; - case TREMOVE: - buf_put_int32(bufp, fcall->params.tremove.fid); - break; - case TSTAT: - buf_put_int32(bufp, fcall->params.tstat.fid); - break; - case TWSTAT: - buf_put_int32(bufp, fcall->params.twstat.fid); - stat = fcall->params.twstat.stat; - - buf_put_int16(bufp, stat->size + 2); - serialize_stat(stat, bufp, extended); - break; - } - - if (buf_check_overflow(bufp)) { - dprintk(DEBUG_ERROR, "buffer overflow\n"); - return -EIO; - } - - return fcall->size; + else + return bufp->p - p; } /** @@ -611,18 +348,14 @@ v9fs_serialize_fcall(struct v9fs_fcall *fcall, void *data, u32 datalen, int v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, - int rcalllen, int extended) + int extended) { struct cbuf buffer; struct cbuf *bufp = &buffer; - struct cbuf dbuffer; - struct cbuf *dbufp = &dbuffer; int i = 0; buf_init(bufp, buf, buflen); - buf_init(dbufp, (char *)rcall + sizeof(struct v9fs_fcall), - rcalllen - sizeof(struct v9fs_fcall)); rcall->size = buf_get_int32(bufp); rcall->id = buf_get_int8(bufp); @@ -630,13 +363,14 @@ v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, dprintk(DEBUG_CONV, "size %d id %d tag %d\n", rcall->size, rcall->id, rcall->tag); + switch (rcall->id) { default: eprintk(KERN_ERR, "unknown message type: %d\n", rcall->id); return -EPROTO; case RVERSION: rcall->params.rversion.msize = buf_get_int32(bufp); - rcall->params.rversion.version = buf_get_stringb(bufp, dbufp); + buf_get_str(bufp, &rcall->params.rversion.version); break; case RFLUSH: break; @@ -647,40 +381,27 @@ v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, break; case RWALK: rcall->params.rwalk.nwqid = buf_get_int16(bufp); - if (rcall->params.rwalk.nwqid > 16) { - eprintk(KERN_ERR, "Rwalk with more than 16 qids: %d\n", - rcall->params.rwalk.nwqid); + if (rcall->params.rwalk.nwqid > V9FS_MAXWELEM) { + eprintk(KERN_ERR, "Rwalk with more than %d qids: %d\n", + V9FS_MAXWELEM, rcall->params.rwalk.nwqid); return -EPROTO; } - rcall->params.rwalk.wqids = buf_alloc(dbufp, - rcall->params.rwalk.nwqid * sizeof(struct v9fs_qid)); - if (rcall->params.rwalk.wqids) - for (i = 
0; i < rcall->params.rwalk.nwqid; i++) { - rcall->params.rwalk.wqids[i].type = - buf_get_int8(bufp); - rcall->params.rwalk.wqids[i].version = - buf_get_int16(bufp); - rcall->params.rwalk.wqids[i].path = - buf_get_int64(bufp); - } + for (i = 0; i < rcall->params.rwalk.nwqid; i++) + buf_get_qid(bufp, &rcall->params.rwalk.wqids[i]); break; case ROPEN: - rcall->params.ropen.qid.type = buf_get_int8(bufp); - rcall->params.ropen.qid.version = buf_get_int32(bufp); - rcall->params.ropen.qid.path = buf_get_int64(bufp); + buf_get_qid(bufp, &rcall->params.ropen.qid); rcall->params.ropen.iounit = buf_get_int32(bufp); break; case RCREATE: - rcall->params.rcreate.qid.type = buf_get_int8(bufp); - rcall->params.rcreate.qid.version = buf_get_int32(bufp); - rcall->params.rcreate.qid.path = buf_get_int64(bufp); + buf_get_qid(bufp, &rcall->params.rcreate.qid); rcall->params.rcreate.iounit = buf_get_int32(bufp); break; case RREAD: rcall->params.rread.count = buf_get_int32(bufp); - rcall->params.rread.data = buf_get_datab(bufp, dbufp, - rcall->params.rread.count); + rcall->params.rread.data = bufp->p; + buf_check_size(bufp, rcall->params.rread.count); break; case RWRITE: rcall->params.rwrite.count = buf_get_int32(bufp); @@ -691,22 +412,442 @@ v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, break; case RSTAT: buf_get_int16(bufp); - rcall->params.rstat.stat = - deserialize_statb(bufp, dbufp, extended); + buf_get_stat(bufp, &rcall->params.rstat.stat, extended); break; case RWSTAT: break; case RERROR: - rcall->params.rerror.error = buf_get_stringb(bufp, dbufp); + buf_get_str(bufp, &rcall->params.rerror.error); if (extended) rcall->params.rerror.errno = buf_get_int16(bufp); break; } - if (buf_check_overflow(bufp) || buf_check_overflow(dbufp)) { + if (buf_check_overflow(bufp)) { dprintk(DEBUG_ERROR, "buffer overflow\n"); return -EIO; } - return rcall->size; + return bufp->p - bufp->sp; +} + +static inline void v9fs_put_int8(struct cbuf *bufp, u8 val, u8 * p) +{ + *p = val; + buf_put_int8(bufp, val); +} + +static inline void v9fs_put_int16(struct cbuf *bufp, u16 val, u16 * p) +{ + *p = val; + buf_put_int16(bufp, val); +} + +static inline void v9fs_put_int32(struct cbuf *bufp, u32 val, u32 * p) +{ + *p = val; + buf_put_int32(bufp, val); +} + +static inline void v9fs_put_int64(struct cbuf *bufp, u64 val, u64 * p) +{ + *p = val; + buf_put_int64(bufp, val); +} + +static inline void +v9fs_put_str(struct cbuf *bufp, char *data, struct v9fs_str *str) +{ + if (data) { + str->len = strlen(data); + str->str = bufp->p; + } else { + str->len = 0; + str->str = NULL; + } + + buf_put_stringn(bufp, data, str->len); +} + +static inline int +v9fs_put_user_data(struct cbuf *bufp, const char __user * data, int count, + unsigned char **pdata) +{ + *pdata = buf_alloc(bufp, count); + return copy_from_user(*pdata, data, count); +} + +static void +v9fs_put_wstat(struct cbuf *bufp, struct v9fs_wstat *wstat, + struct v9fs_stat *stat, int statsz, int extended) +{ + v9fs_put_int16(bufp, statsz, &stat->size); + v9fs_put_int16(bufp, wstat->type, &stat->type); + v9fs_put_int32(bufp, wstat->dev, &stat->dev); + v9fs_put_int8(bufp, wstat->qid.type, &stat->qid.type); + v9fs_put_int32(bufp, wstat->qid.version, &stat->qid.version); + v9fs_put_int64(bufp, wstat->qid.path, &stat->qid.path); + v9fs_put_int32(bufp, wstat->mode, &stat->mode); + v9fs_put_int32(bufp, wstat->atime, &stat->atime); + v9fs_put_int32(bufp, wstat->mtime, &stat->mtime); + v9fs_put_int64(bufp, wstat->length, &stat->length); + + v9fs_put_str(bufp, wstat->name, 
&stat->name); + v9fs_put_str(bufp, wstat->uid, &stat->uid); + v9fs_put_str(bufp, wstat->gid, &stat->gid); + v9fs_put_str(bufp, wstat->muid, &stat->muid); + + if (extended) { + v9fs_put_str(bufp, wstat->extension, &stat->extension); + v9fs_put_int32(bufp, wstat->n_uid, &stat->n_uid); + v9fs_put_int32(bufp, wstat->n_gid, &stat->n_gid); + v9fs_put_int32(bufp, wstat->n_muid, &stat->n_muid); + } +} + +static struct v9fs_fcall * +v9fs_create_common(struct cbuf *bufp, u32 size, u8 id) +{ + struct v9fs_fcall *fc; + + size += 4 + 1 + 2; /* size[4] id[1] tag[2] */ + fc = kmalloc(sizeof(struct v9fs_fcall) + size, GFP_KERNEL); + if (!fc) + return ERR_PTR(-ENOMEM); + + fc->sdata = (char *)fc + sizeof(*fc); + + buf_init(bufp, (char *)fc->sdata, size); + v9fs_put_int32(bufp, size, &fc->size); + v9fs_put_int8(bufp, id, &fc->id); + v9fs_put_int16(bufp, V9FS_NOTAG, &fc->tag); + + return fc; +} + +void v9fs_set_tag(struct v9fs_fcall *fc, u16 tag) +{ + *(__le16 *) (fc->sdata + 5) = cpu_to_le16(tag); +} + +struct v9fs_fcall *v9fs_create_tversion(u32 msize, char *version) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 2 + strlen(version); /* msize[4] version[s] */ + fc = v9fs_create_common(bufp, size, TVERSION); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, msize, &fc->params.tversion.msize); + v9fs_put_str(bufp, version, &fc->params.tversion.version); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tauth(u32 afid, char *uname, char *aname) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 2 + strlen(uname) + 2 + strlen(aname); /* afid[4] uname[s] aname[s] */ + fc = v9fs_create_common(bufp, size, TAUTH); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, afid, &fc->params.tauth.afid); + v9fs_put_str(bufp, uname, &fc->params.tauth.uname); + v9fs_put_str(bufp, aname, &fc->params.tauth.aname); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall * +v9fs_create_tattach(u32 fid, u32 afid, char *uname, char *aname) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 4 + 2 + strlen(uname) + 2 + strlen(aname); /* fid[4] afid[4] uname[s] aname[s] */ + fc = v9fs_create_common(bufp, size, TATTACH); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tattach.fid); + v9fs_put_int32(bufp, afid, &fc->params.tattach.afid); + v9fs_put_str(bufp, uname, &fc->params.tattach.uname); + v9fs_put_str(bufp, aname, &fc->params.tattach.aname); + + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tflush(u16 oldtag) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 2; /* oldtag[2] */ + fc = v9fs_create_common(bufp, size, TFLUSH); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int16(bufp, oldtag, &fc->params.tflush.oldtag); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_twalk(u32 fid, u32 newfid, u16 nwname, + char **wnames) +{ + int i, size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + if (nwname > V9FS_MAXWELEM) { + dprintk(DEBUG_ERROR, "nwname > %d\n", V9FS_MAXWELEM); + return NULL; + } + + size = 4 + 4 + 2; /* fid[4] newfid[4] nwname[2] ... 
*/ + for (i = 0; i < nwname; i++) { + size += 2 + strlen(wnames[i]); /* wname[s] */ + } + + fc = v9fs_create_common(bufp, size, TWALK); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.twalk.fid); + v9fs_put_int32(bufp, newfid, &fc->params.twalk.newfid); + v9fs_put_int16(bufp, nwname, &fc->params.twalk.nwname); + for (i = 0; i < nwname; i++) { + v9fs_put_str(bufp, wnames[i], &fc->params.twalk.wnames[i]); + } + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_topen(u32 fid, u8 mode) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 1; /* fid[4] mode[1] */ + fc = v9fs_create_common(bufp, size, TOPEN); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.topen.fid); + v9fs_put_int8(bufp, mode, &fc->params.topen.mode); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 2 + strlen(name) + 4 + 1; /* fid[4] name[s] perm[4] mode[1] */ + fc = v9fs_create_common(bufp, size, TCREATE); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tcreate.fid); + v9fs_put_str(bufp, name, &fc->params.tcreate.name); + v9fs_put_int32(bufp, perm, &fc->params.tcreate.perm); + v9fs_put_int8(bufp, mode, &fc->params.tcreate.mode); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tread(u32 fid, u64 offset, u32 count) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 8 + 4; /* fid[4] offset[8] count[4] */ + fc = v9fs_create_common(bufp, size, TREAD); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tread.fid); + v9fs_put_int64(bufp, offset, &fc->params.tread.offset); + v9fs_put_int32(bufp, count, &fc->params.tread.count); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_twrite(u32 fid, u64 offset, u32 count, + const char __user * data) +{ + int size, err; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 8 + 4 + count; /* fid[4] offset[8] count[4] data[count] */ + fc = v9fs_create_common(bufp, size, TWRITE); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.twrite.fid); + v9fs_put_int64(bufp, offset, &fc->params.twrite.offset); + v9fs_put_int32(bufp, count, &fc->params.twrite.count); + err = v9fs_put_user_data(bufp, data, count, &fc->params.twrite.data); + if (err) { + kfree(fc); + fc = ERR_PTR(err); + } + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tclunk(u32 fid) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4; /* fid[4] */ + fc = v9fs_create_common(bufp, size, TCLUNK); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tclunk.fid); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tremove(u32 fid) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 
4; /* fid[4] */ + fc = v9fs_create_common(bufp, size, TREMOVE); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tremove.fid); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tstat(u32 fid) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4; /* fid[4] */ + fc = v9fs_create_common(bufp, size, TSTAT); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tstat.fid); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_twstat(u32 fid, struct v9fs_wstat *wstat, + int extended) +{ + int size, statsz; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + statsz = v9fs_size_wstat(wstat, extended); + size = 4 + 2 + 2 + statsz; /* fid[4] stat[n] */ + fc = v9fs_create_common(bufp, size, TWSTAT); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.twstat.fid); + buf_put_int16(bufp, statsz + 2); + v9fs_put_wstat(bufp, wstat, &fc->params.twstat.stat, statsz, extended); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; } diff --git a/fs/9p/conv.h b/fs/9p/conv.h index d5e33e17a6..26a736e4a2 100644 --- a/fs/9p/conv.h +++ b/fs/9p/conv.h @@ -1,8 +1,9 @@ /* * linux/fs/9p/conv.h * - * 9P protocol conversion definitions + * 9P protocol conversion definitions. * + * Copyright (C) 2005 by Latchesar Ionkov * Copyright (C) 2004 by Eric Van Hensbergen * Copyright (C) 2002 by Ron Minnich * @@ -25,11 +26,26 @@ */ int v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat, - u32 statlen, int extended); -int v9fs_serialize_fcall(struct v9fs_fcall *tcall, void *buf, u32 buflen, int extended); int v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, - int rcalllen, int extended); + int extended); + +void v9fs_set_tag(struct v9fs_fcall *fc, u16 tag); -/* this one is actually in error.c right now */ -int v9fs_errstr2errno(char *errstr); +struct v9fs_fcall *v9fs_create_tversion(u32 msize, char *version); +struct v9fs_fcall *v9fs_create_tauth(u32 afid, char *uname, char *aname); +struct v9fs_fcall *v9fs_create_tattach(u32 fid, u32 afid, char *uname, + char *aname); +struct v9fs_fcall *v9fs_create_tflush(u16 oldtag); +struct v9fs_fcall *v9fs_create_twalk(u32 fid, u32 newfid, u16 nwname, + char **wnames); +struct v9fs_fcall *v9fs_create_topen(u32 fid, u8 mode); +struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode); +struct v9fs_fcall *v9fs_create_tread(u32 fid, u64 offset, u32 count); +struct v9fs_fcall *v9fs_create_twrite(u32 fid, u64 offset, u32 count, + const char __user *data); +struct v9fs_fcall *v9fs_create_tclunk(u32 fid); +struct v9fs_fcall *v9fs_create_tremove(u32 fid); +struct v9fs_fcall *v9fs_create_tstat(u32 fid); +struct v9fs_fcall *v9fs_create_twstat(u32 fid, struct v9fs_wstat *wstat, + int extended); diff --git a/fs/9p/debug.h b/fs/9p/debug.h index 4445f06919..fe55103278 100644 --- a/fs/9p/debug.h +++ b/fs/9p/debug.h @@ -51,16 +51,23 @@ do { \ #if DEBUG_DUMP_PKT static inline void dump_data(const unsigned char *data, unsigned int datalen) { - int i, j; - int len = datalen; + int i, n; + char buf[5*8]; - printk(KERN_DEBUG "data "); - for (i = 0; i < len; i += 4) { - for (j = 0; (j < 4) && (i + j < len); j++) - printk(KERN_DEBUG "%02x", data[i + j]); - printk(KERN_DEBUG " "); + n = 0; + i = 
0; + while (i < datalen) { + n += snprintf(buf+n, sizeof(buf)-n, "%02x", data[i++]); + if (i%4 == 0) + n += snprintf(buf+n, sizeof(buf)-n, " "); + + if (i%16 == 0) { + dprintk(DEBUG_ERROR, "%s\n", buf); + n = 0; + } } - printk(KERN_DEBUG "\n"); + + dprintk(DEBUG_ERROR, "%s\n", buf); } #else /* DEBUG_DUMP_PKT */ static inline void dump_data(const unsigned char *data, unsigned int datalen) diff --git a/fs/9p/error.c b/fs/9p/error.c index 834cb179e3..e4b6f8f38b 100644 --- a/fs/9p/error.c +++ b/fs/9p/error.c @@ -33,7 +33,6 @@ #include #include -#include #include "debug.h" #include "error.h" @@ -55,7 +54,8 @@ int v9fs_error_init(void) /* load initial error map into hash table */ for (c = errmap; c->name != NULL; c++) { - bucket = jhash(c->name, strlen(c->name), 0) % ERRHASHSZ; + c->namelen = strlen(c->name); + bucket = jhash(c->name, c->namelen, 0) % ERRHASHSZ; INIT_HLIST_NODE(&c->list); hlist_add_head(&c->list, &hash_errmap[bucket]); } @@ -69,15 +69,15 @@ int v9fs_error_init(void) * */ -int v9fs_errstr2errno(char *errstr) +int v9fs_errstr2errno(char *errstr, int len) { int errno = 0; struct hlist_node *p = NULL; struct errormap *c = NULL; - int bucket = jhash(errstr, strlen(errstr), 0) % ERRHASHSZ; + int bucket = jhash(errstr, len, 0) % ERRHASHSZ; hlist_for_each_entry(c, p, &hash_errmap[bucket], list) { - if (!strcmp(c->name, errstr)) { + if (c->namelen==len && !memcmp(c->name, errstr, len)) { errno = c->val; break; } diff --git a/fs/9p/error.h b/fs/9p/error.h index 78f89acf7c..8b3176b3f6 100644 --- a/fs/9p/error.h +++ b/fs/9p/error.h @@ -36,6 +36,7 @@ struct errormap { char *name; int val; + int namelen; struct hlist_node list; }; @@ -175,4 +176,4 @@ static struct errormap errmap[] = { }; extern int v9fs_error_init(void); -extern int v9fs_errstr2errno(char *errstr); +extern int v9fs_errstr2errno(char *errstr, int len); diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 60ef8aba75..eda449778f 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -31,9 +31,6 @@ #include "v9fs.h" #include "9p.h" #include "v9fs_vfs.h" -#include "transport.h" -#include "mux.h" -#include "conv.h" #include "fid.h" /** diff --git a/fs/9p/mux.c b/fs/9p/mux.c index 62b6ad0767..f21cf50839 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -35,8 +35,8 @@ #include "debug.h" #include "v9fs.h" #include "9p.h" -#include "transport.h" #include "conv.h" +#include "transport.h" #include "mux.h" #define ERREQFLUSH 1 @@ -74,6 +74,7 @@ struct v9fs_mux_data { wait_queue_head_t equeue; struct list_head req_list; struct list_head unsent_req_list; + struct v9fs_fcall *rcall; int rpos; char *rbuf; int wpos; @@ -101,11 +102,15 @@ struct v9fs_mux_rpc { wait_queue_head_t wqueue; }; +extern int v9fs_errstr2errno(char *str, int len); + static int v9fs_poll_proc(void *); static void v9fs_read_work(void *); static void v9fs_write_work(void *); static void v9fs_pollwait(struct file *filp, wait_queue_head_t * wait_address, poll_table * p); +static u16 v9fs_mux_get_tag(struct v9fs_mux_data *); +static void v9fs_mux_put_tag(struct v9fs_mux_data *, u16); static DECLARE_MUTEX(v9fs_mux_task_lock); static struct workqueue_struct *v9fs_mux_wq; @@ -166,8 +171,9 @@ static void v9fs_mux_poll_start(struct v9fs_mux_data *m) if (v9fs_mux_poll_tasks[i].task == NULL) { vpt = &v9fs_mux_poll_tasks[i]; dprintk(DEBUG_MUX, "create proc %p\n", vpt); - vpt->task = kthread_create(v9fs_poll_proc, - vpt, "v9fs-poll"); + vpt->task = + kthread_create(v9fs_poll_proc, vpt, + "v9fs-poll"); INIT_LIST_HEAD(&vpt->mux_list); vpt->muxnum = 0; v9fs_mux_poll_task_num++; @@ -253,7 +259,7 @@ struct 
v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, struct v9fs_mux_data *m, *mtmp; dprintk(DEBUG_MUX, "transport %p msize %d\n", trans, msize); - m = kmalloc(sizeof(struct v9fs_mux_data) + 2 * msize, GFP_KERNEL); + m = kmalloc(sizeof(struct v9fs_mux_data), GFP_KERNEL); if (!m) return ERR_PTR(-ENOMEM); @@ -268,10 +274,11 @@ struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, init_waitqueue_head(&m->equeue); INIT_LIST_HEAD(&m->req_list); INIT_LIST_HEAD(&m->unsent_req_list); + m->rcall = NULL; m->rpos = 0; - m->rbuf = (char *)m + sizeof(struct v9fs_mux_data); + m->rbuf = NULL; m->wpos = m->wsize = 0; - m->wbuf = m->rbuf + msize; + m->wbuf = NULL; INIT_WORK(&m->rq, v9fs_read_work, m); INIT_WORK(&m->wq, v9fs_write_work, m); m->wsched = 0; @@ -427,29 +434,6 @@ static int v9fs_poll_proc(void *a) return 0; } -static inline int v9fs_write_req(struct v9fs_mux_data *m, struct v9fs_req *req) -{ - int n; - - list_move_tail(&req->req_list, &m->req_list); - n = v9fs_serialize_fcall(req->tcall, m->wbuf, m->msize, *m->extended); - if (n < 0) { - req->err = n; - list_del(&req->req_list); - if (req->cb) { - spin_unlock(&m->lock); - (*req->cb) (req->cba, req->tcall, req->rcall, req->err); - req->cb = NULL; - spin_lock(&m->lock); - } else - kfree(req->rcall); - - kfree(req); - } - - return n; -} - /** * v9fs_write_work - called when a transport can send some data */ @@ -457,7 +441,7 @@ static void v9fs_write_work(void *a) { int n, err; struct v9fs_mux_data *m; - struct v9fs_req *req, *rtmp; + struct v9fs_req *req; m = a; @@ -472,17 +456,15 @@ static void v9fs_write_work(void *a) return; } - err = 0; spin_lock(&m->lock); - list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, - req_list) { - err = v9fs_write_req(m, req); - if (err > 0) - break; - } - - m->wsize = err; + req = + list_entry(m->unsent_req_list.next, struct v9fs_req, + req_list); + list_move_tail(&req->req_list, &m->req_list); + m->wbuf = req->tcall->sdata; + m->wsize = req->tcall->size; m->wpos = 0; + dump_data(m->wbuf, m->wsize); spin_unlock(&m->lock); } @@ -526,24 +508,23 @@ static void v9fs_write_work(void *a) static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) { int ecode, tag; - char *ename; + struct v9fs_str *ename; tag = req->tag; if (req->rcall->id == RERROR && !req->err) { ecode = req->rcall->params.rerror.errno; - ename = req->rcall->params.rerror.error; + ename = &req->rcall->params.rerror.error; - dprintk(DEBUG_MUX, "Rerror %s\n", ename); + dprintk(DEBUG_MUX, "Rerror %.*s\n", ename->len, ename->str); if (*m->extended) req->err = -ecode; if (!req->err) { - req->err = v9fs_errstr2errno(ename); + req->err = v9fs_errstr2errno(ename->str, ename->len); if (!req->err) { /* string match failed */ - dprintk(DEBUG_ERROR, "unknown error: %s\n", - ename); + PRINT_FCALL_ERROR("unknown error", req->rcall); } if (!req->err) @@ -565,8 +546,7 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) } else kfree(req->rcall); - if (tag != V9FS_NOTAG) - v9fs_put_idpool(tag, &m->tidpool); + v9fs_mux_put_tag(m, tag); wake_up(&m->equeue); kfree(req); @@ -577,10 +557,11 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) */ static void v9fs_read_work(void *a) { - int n, err, rcallen; + int n, err; struct v9fs_mux_data *m; struct v9fs_req *req, *rptr, *rreq; struct v9fs_fcall *rcall; + char *rbuf; m = a; @@ -589,6 +570,19 @@ static void v9fs_read_work(void *a) rcall = NULL; dprintk(DEBUG_MUX, "start mux %p pos %d\n", m, m->rpos); + + if 
(!m->rcall) { + m->rcall = + kmalloc(sizeof(struct v9fs_fcall) + m->msize, GFP_KERNEL); + if (!m->rcall) { + err = -ENOMEM; + goto error; + } + + m->rbuf = (char *)m->rcall + sizeof(struct v9fs_fcall); + m->rpos = 0; + } + clear_bit(Rpending, &m->wsched); err = m->trans->read(m->trans, m->rbuf + m->rpos, m->msize - m->rpos); dprintk(DEBUG_MUX, "mux %p got %d bytes\n", m, err); @@ -613,21 +607,32 @@ static void v9fs_read_work(void *a) if (m->rpos < n) break; - rcallen = n + V9FS_FCALLHDRSZ; - rcall = kmalloc(rcallen, GFP_KERNEL); - if (!rcall) { - err = -ENOMEM; - goto error; - } - dump_data(m->rbuf, n); - err = v9fs_deserialize_fcall(m->rbuf, n, rcall, rcallen, - *m->extended); + err = + v9fs_deserialize_fcall(m->rbuf, n, m->rcall, *m->extended); if (err < 0) { - kfree(rcall); goto error; } + rcall = m->rcall; + rbuf = m->rbuf; + if (m->rpos > n) { + m->rcall = kmalloc(sizeof(struct v9fs_fcall) + m->msize, + GFP_KERNEL); + if (!m->rcall) { + err = -ENOMEM; + goto error; + } + + m->rbuf = (char *)m->rcall + sizeof(struct v9fs_fcall); + memmove(m->rbuf, rbuf + n, m->rpos - n); + m->rpos -= n; + } else { + m->rcall = NULL; + m->rbuf = NULL; + m->rpos = 0; + } + dprintk(DEBUG_MUX, "mux %p fcall id %d tag %d\n", m, rcall->id, rcall->tag); @@ -642,6 +647,7 @@ static void v9fs_read_work(void *a) process_request(m, req); break; } + } if (!req) { @@ -652,10 +658,6 @@ static void v9fs_read_work(void *a) m, rcall->id, rcall->tag); kfree(rcall); } - - if (m->rpos > n) - memmove(m->rbuf, m->rbuf + n, m->rpos - n); - m->rpos -= n; } if (!list_empty(&m->req_list)) { @@ -710,12 +712,13 @@ static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m, if (tc->id == TVERSION) n = V9FS_NOTAG; else - n = v9fs_get_idpool(&m->tidpool); + n = v9fs_mux_get_tag(m); if (n < 0) return ERR_PTR(-ENOMEM); - tc->tag = n; + v9fs_set_tag(tc, n); + req->tag = n; req->tcall = tc; req->rcall = NULL; @@ -773,9 +776,7 @@ v9fs_mux_flush_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, if (!cb) spin_unlock(&m->lock); - if (v9fs_check_idpool(tag, &m->tidpool)) - v9fs_put_idpool(tag, &m->tidpool); - + v9fs_mux_put_tag(m, tag); kfree(tc); kfree(rc); } @@ -787,10 +788,7 @@ v9fs_mux_flush_request(struct v9fs_mux_data *m, struct v9fs_req *req) dprintk(DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag); - fc = kmalloc(sizeof(struct v9fs_fcall), GFP_KERNEL); - fc->id = TFLUSH; - fc->params.tflush.oldtag = req->tag; - + fc = v9fs_create_tflush(req->tag); v9fs_send_request(m, fc, v9fs_mux_flush_cb, m); } @@ -939,3 +937,20 @@ void v9fs_mux_cancel(struct v9fs_mux_data *m, int err) wake_up(&m->equeue); } + +static u16 v9fs_mux_get_tag(struct v9fs_mux_data *m) +{ + int tag; + + tag = v9fs_get_idpool(&m->tidpool); + if (tag < 0) + return V9FS_NOTAG; + else + return (u16) tag; +} + +static void v9fs_mux_put_tag(struct v9fs_mux_data *m, u16 tag) +{ + if (tag != V9FS_NOTAG && v9fs_check_idpool(tag, &m->tidpool)) + v9fs_put_idpool(tag, &m->tidpool); +} diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c index 9ef404c75c..44e830697a 100644 --- a/fs/9p/trans_sock.c +++ b/fs/9p/trans_sock.c @@ -110,7 +110,6 @@ static int v9fs_sock_send(struct v9fs_transport *trans, void *v, int len) if (!(ts->filp->f_flags & O_NONBLOCK)) dprintk(DEBUG_ERROR, "blocking write ...\n"); - dump_data(v, len); oldfs = get_fs(); set_fs(get_ds()); ret = vfs_write(ts->filp, (void __user *)v, len, &ts->filp->f_pos); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 5e0f79355f..519b21d8b1 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -37,7 +37,6 @@ #include 
"v9fs_vfs.h" #include "transport.h" #include "mux.h" -#include "conv.h" /* TODO: sysfs or debugfs interface */ int v9fs_debug_level = 0; /* feature-rific global debug level */ @@ -353,7 +352,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses, } /* Really should check for 9P1 and report error */ - if (!strcmp(fcall->params.rversion.version, "9P2000.u")) { + if (!v9fs_str_compare("9P2000.u", &fcall->params.rversion.version)) { dprintk(DEBUG_9P, "9P2000 UNIX extensions enabled\n"); v9ses->extended = 1; } else { diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 2f2cea7ee3..c78502ad00 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -45,9 +45,8 @@ extern struct dentry_operations v9fs_dentry_operations; struct inode *v9fs_get_inode(struct super_block *sb, int mode); ino_t v9fs_qid2ino(struct v9fs_qid *qid); -void v9fs_mistat2inode(struct v9fs_stat *, struct inode *, - struct super_block *); +void v9fs_stat2inode(struct v9fs_stat *, struct inode *, struct super_block *); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); -void v9fs_inode2mistat(struct inode *inode, struct v9fs_stat *mistat); +void v9fs_inode2stat(struct inode *inode, struct v9fs_stat *stat); void v9fs_dentry_release(struct dentry *); diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 4887df7673..2dd806dac9 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -40,7 +40,6 @@ #include "v9fs.h" #include "9p.h" #include "v9fs_vfs.h" -#include "conv.h" #include "fid.h" /** @@ -108,7 +107,8 @@ void v9fs_dentry_release(struct dentry *dentry) err = v9fs_t_clunk(current_fid->v9ses, current_fid->fid); if (err < 0) - dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); + dprintk(DEBUG_ERROR, "clunk failed: %d name %s\n", + err, dentry->d_iname); v9fs_fid_destroy(current_fid); } diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 3893dd307d..ae6d032b9b 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -37,8 +37,8 @@ #include "debug.h" #include "v9fs.h" #include "9p.h" -#include "v9fs_vfs.h" #include "conv.h" +#include "v9fs_vfs.h" #include "fid.h" /** @@ -77,17 +77,13 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) unsigned int i, n, s; int fid = -1; int ret = 0; - struct v9fs_stat *mi = NULL; + struct v9fs_stat stat; int over = 0; dprintk(DEBUG_VFS, "name %s\n", filp->f_dentry->d_name.name); fid = file->fid; - mi = kmalloc(v9ses->maxdata, GFP_KERNEL); - if (!mi) - return -ENOMEM; - if (file->rdir_fcall && (filp->f_pos != file->rdir_pos)) { kfree(file->rdir_fcall); file->rdir_fcall = NULL; @@ -99,18 +95,18 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) while (i < n) { s = v9fs_deserialize_stat( file->rdir_fcall->params.rread.data + i, - n - i, mi, v9ses->maxdata, v9ses->extended); + n - i, &stat, v9ses->extended); if (s == 0) { dprintk(DEBUG_ERROR, - "error while deserializing mistat\n"); + "error while deserializing stat\n"); ret = -EIO; goto FreeStructs; } - over = filldir(dirent, mi->name, strlen(mi->name), - filp->f_pos, v9fs_qid2ino(&mi->qid), - dt_type(mi)); + over = filldir(dirent, stat.name.str, stat.name.len, + filp->f_pos, v9fs_qid2ino(&stat.qid), + dt_type(&stat)); if (over) { file->rdir_fpos = i; @@ -130,7 +126,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) while (!over) { ret = v9fs_t_read(v9ses, fid, filp->f_pos, - v9ses->maxdata-V9FS_IOHDRSZ, &fcall); + v9ses->maxdata-V9FS_IOHDRSZ, &fcall); if (ret < 0) { 
dprintk(DEBUG_ERROR, "error while reading: %d: %p\n", ret, fcall); @@ -142,17 +138,17 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) i = 0; while (i < n) { s = v9fs_deserialize_stat(fcall->params.rread.data + i, - n - i, mi, v9ses->maxdata, v9ses->extended); + n - i, &stat, v9ses->extended); if (s == 0) { dprintk(DEBUG_ERROR, - "error while deserializing mistat\n"); + "error while deserializing stat\n"); return -EIO; } - over = filldir(dirent, mi->name, strlen(mi->name), - filp->f_pos, v9fs_qid2ino(&mi->qid), - dt_type(mi)); + over = filldir(dirent, stat.name.str, stat.name.len, + filp->f_pos, v9fs_qid2ino(&stat.qid), + dt_type(&stat)); if (over) { file->rdir_fcall = fcall; @@ -171,7 +167,6 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) FreeStructs: kfree(fcall); - kfree(mi); return ret; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index e13577da91..6852f0eb96 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -117,9 +118,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) result = v9fs_t_open(v9ses, newfid, open_mode, &fcall); if (result < 0) { - dprintk(DEBUG_ERROR, - "open failed, open_mode 0x%x: %s\n", open_mode, - FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("open failed", fcall); kfree(fcall); return result; } @@ -256,7 +255,6 @@ v9fs_file_write(struct file *filp, const char __user * data, int result = -EIO; int rsize = 0; int total = 0; - char *buf; dprintk(DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, (int)*offset); @@ -264,28 +262,14 @@ v9fs_file_write(struct file *filp, const char __user * data, if (v9fid->iounit != 0 && rsize > v9fid->iounit) rsize = v9fid->iounit; - buf = kmalloc(v9ses->maxdata - V9FS_IOHDRSZ, GFP_KERNEL); - if (!buf) - return -ENOMEM; - do { if (count < rsize) rsize = count; - result = copy_from_user(buf, data, rsize); - if (result) { - dprintk(DEBUG_ERROR, "Problem copying from user\n"); - kfree(buf); - return -EFAULT; - } - - dump_data(buf, rsize); - result = v9fs_t_write(v9ses, fid, *offset, rsize, buf, &fcall); + result = v9fs_t_write(v9ses, fid, *offset, rsize, data, &fcall); if (result < 0) { - eprintk(KERN_ERR, "error while writing: %s(%d)\n", - FCALL_ERROR(fcall), result); + PRINT_FCALL_ERROR("error while writing", fcall); kfree(fcall); - kfree(buf); return result; } else *offset += result; @@ -305,7 +289,6 @@ v9fs_file_write(struct file *filp, const char __user * data, total += result; } while (count); - kfree(buf); return total; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index f11edde643..742bcd0dc4 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -40,7 +40,6 @@ #include "v9fs.h" #include "9p.h" #include "v9fs_vfs.h" -#include "conv.h" #include "fid.h" static struct inode_operations v9fs_dir_inode_operations; @@ -127,100 +126,32 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) } /** - * v9fs_blank_mistat - helper function to setup a 9P stat structure + * v9fs_blank_wstat - helper function to setup a 9P stat structure * @v9ses: 9P session info (for determining extended mode) - * @mistat: structure to initialize + * @wstat: structure to initialize * */ static void -v9fs_blank_mistat(struct v9fs_session_info *v9ses, struct v9fs_stat *mistat) +v9fs_blank_wstat(struct v9fs_wstat *wstat) { - mistat->type = ~0; - mistat->dev = ~0; - mistat->qid.type = ~0; - mistat->qid.version = ~0; - *((long long *)&mistat->qid.path) = ~0; - 
mistat->mode = ~0; - mistat->atime = ~0; - mistat->mtime = ~0; - mistat->length = ~0; - mistat->name = mistat->data; - mistat->uid = mistat->data; - mistat->gid = mistat->data; - mistat->muid = mistat->data; - if (v9ses->extended) { - mistat->n_uid = ~0; - mistat->n_gid = ~0; - mistat->n_muid = ~0; - mistat->extension = mistat->data; - } - *mistat->data = 0; -} - -/** - * v9fs_mistat2unix - convert mistat to unix stat - * @mistat: Plan 9 metadata (mistat) structure - * @buf: unix metadata (stat) structure to populate - * @sb: superblock - * - */ - -static void -v9fs_mistat2unix(struct v9fs_stat *mistat, struct stat *buf, - struct super_block *sb) -{ - struct v9fs_session_info *v9ses = sb ? sb->s_fs_info : NULL; - - buf->st_nlink = 1; - - buf->st_atime = mistat->atime; - buf->st_mtime = mistat->mtime; - buf->st_ctime = mistat->mtime; - - buf->st_uid = (unsigned short)-1; - buf->st_gid = (unsigned short)-1; - - if (v9ses && v9ses->extended) { - /* TODO: string to uid mapping via user-space daemon */ - if (mistat->n_uid != -1) - sscanf(mistat->uid, "%x", (unsigned int *)&buf->st_uid); - - if (mistat->n_gid != -1) - sscanf(mistat->gid, "%x", (unsigned int *)&buf->st_gid); - } - - if (buf->st_uid == (unsigned short)-1) - buf->st_uid = v9ses->uid; - if (buf->st_gid == (unsigned short)-1) - buf->st_gid = v9ses->gid; - - buf->st_mode = p9mode2unixmode(v9ses, mistat->mode); - if ((S_ISBLK(buf->st_mode)) || (S_ISCHR(buf->st_mode))) { - char type = 0; - int major = -1; - int minor = -1; - sscanf(mistat->extension, "%c %u %u", &type, &major, &minor); - switch (type) { - case 'c': - buf->st_mode &= ~S_IFBLK; - buf->st_mode |= S_IFCHR; - break; - case 'b': - break; - default: - dprintk(DEBUG_ERROR, "Unknown special type %c (%s)\n", - type, mistat->extension); - }; - buf->st_rdev = MKDEV(major, minor); - } else - buf->st_rdev = 0; - - buf->st_size = mistat->length; - - buf->st_blksize = sb->s_blocksize; - buf->st_blocks = - (buf->st_size + buf->st_blksize - 1) >> sb->s_blocksize_bits; + wstat->type = ~0; + wstat->dev = ~0; + wstat->qid.type = ~0; + wstat->qid.version = ~0; + *((long long *)&wstat->qid.path) = ~0; + wstat->mode = ~0; + wstat->atime = ~0; + wstat->mtime = ~0; + wstat->length = ~0; + wstat->name = NULL; + wstat->uid = NULL; + wstat->gid = NULL; + wstat->muid = NULL; + wstat->n_uid = ~0; + wstat->n_gid = ~0; + wstat->n_muid = ~0; + wstat->extension = NULL; } /** @@ -312,7 +243,6 @@ v9fs_create(struct inode *dir, struct inode *file_inode = NULL; struct v9fs_fcall *fcall = NULL; struct v9fs_qid qid; - struct stat newstat; int dirfidnum = -1; long newfid = -1; int result = 0; @@ -350,7 +280,7 @@ v9fs_create(struct inode *dir, result = v9fs_t_walk(v9ses, dirfidnum, newfid, NULL, &fcall); if (result < 0) { - dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("clone error", fcall); v9fs_put_idpool(newfid, &v9ses->fidpool); newfid = -1; goto CleanUpFid; @@ -362,9 +292,7 @@ v9fs_create(struct inode *dir, result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name, perm, open_mode, &fcall); if (result < 0) { - dprintk(DEBUG_ERROR, "create fails: %s(%d)\n", - FCALL_ERROR(fcall), result); - + PRINT_FCALL_ERROR("create fails", fcall); goto CleanUpFid; } @@ -400,7 +328,7 @@ v9fs_create(struct inode *dir, result = v9fs_t_walk(v9ses, dirfidnum, wfidno, (char *) file_dentry->d_name.name, &fcall); if (result < 0) { - dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("clone error", fcall); v9fs_put_idpool(wfidno, &v9ses->fidpool); 
wfidno = -1; goto CleanUpFid; @@ -421,21 +349,21 @@ v9fs_create(struct inode *dir, result = v9fs_t_stat(v9ses, wfidno, &fcall); if (result < 0) { - dprintk(DEBUG_ERROR, "stat error: %s(%d)\n", FCALL_ERROR(fcall), - result); + PRINT_FCALL_ERROR("stat error", fcall); goto CleanUpFid; } - v9fs_mistat2unix(fcall->params.rstat.stat, &newstat, sb); - file_inode = v9fs_get_inode(sb, newstat.st_mode); + file_inode = v9fs_get_inode(sb, + p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode)); + if ((!file_inode) || IS_ERR(file_inode)) { dprintk(DEBUG_ERROR, "create inode failed\n"); result = -EBADF; goto CleanUpFid; } - v9fs_mistat2inode(fcall->params.rstat.stat, file_inode, sb); + v9fs_stat2inode(&fcall->params.rstat.stat, file_inode, sb); kfree(fcall); fcall = NULL; file_dentry->d_op = &v9fs_dentry_operations; @@ -500,10 +428,9 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) } result = v9fs_t_remove(v9ses, fid, &fcall); - if (result < 0) - dprintk(DEBUG_ERROR, "remove of file fails: %s(%d)\n", - FCALL_ERROR(fcall), result); - else { + if (result < 0) { + PRINT_FCALL_ERROR("remove fails", fcall); + } else { v9fs_put_idpool(fid, &v9ses->fidpool); v9fs_fid_destroy(v9fid); } @@ -558,7 +485,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, struct v9fs_fid *fid; struct inode *inode; struct v9fs_fcall *fcall = NULL; - struct stat newstat; int dirfidnum = -1; int newfid = -1; int result = 0; @@ -611,8 +537,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, goto FreeFcall; } - v9fs_mistat2unix(fcall->params.rstat.stat, &newstat, sb); - inode = v9fs_get_inode(sb, newstat.st_mode); + inode = v9fs_get_inode(sb, p9mode2unixmode(v9ses, + fcall->params.rstat.stat.mode)); if (IS_ERR(inode) && (PTR_ERR(inode) == -ENOSPC)) { eprintk(KERN_WARNING, "inode alloc failes, returns %ld\n", @@ -622,7 +548,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, goto FreeFcall; } - inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat->qid); + inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat.qid); fid = v9fs_fid_create(dentry, v9ses, newfid, 0); if (fid == NULL) { @@ -631,10 +557,10 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, goto FreeFcall; } - fid->qid = fcall->params.rstat.stat->qid; + fid->qid = fcall->params.rstat.stat.qid; dentry->d_op = &v9fs_dentry_operations; - v9fs_mistat2inode(fcall->params.rstat.stat, inode, inode->i_sb); + v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb); d_add(dentry, inode); kfree(fcall); @@ -690,7 +616,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, v9fs_fid_lookup(old_dentry->d_parent); struct v9fs_fid *newdirfid = v9fs_fid_lookup(new_dentry->d_parent); - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); + struct v9fs_wstat wstat; struct v9fs_fcall *fcall = NULL; int fid = -1; int olddirfidnum = -1; @@ -699,9 +625,6 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, dprintk(DEBUG_VFS, "\n"); - if (!mistat) - return -ENOMEM; - if ((!oldfid) || (!olddirfid) || (!newdirfid)) { dprintk(DEBUG_ERROR, "problem with arguments\n"); return -EBADF; @@ -725,26 +648,15 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto FreeFcallnBail; } - v9fs_blank_mistat(v9ses, mistat); - - strcpy(mistat->data + 1, v9ses->name); - mistat->name = mistat->data + 1 + strlen(v9ses->name); - - if (new_dentry->d_name.len > - (v9ses->maxdata - strlen(v9ses->name) - 
sizeof(struct v9fs_stat))) { - dprintk(DEBUG_ERROR, "new name too long\n"); - goto FreeFcallnBail; - } + v9fs_blank_wstat(&wstat); + wstat.muid = v9ses->name; + wstat.name = (char *) new_dentry->d_name.name; - strcpy(mistat->name, new_dentry->d_name.name); - retval = v9fs_t_wstat(v9ses, fid, mistat, &fcall); + retval = v9fs_t_wstat(v9ses, fid, &wstat, &fcall); FreeFcallnBail: - kfree(mistat); - if (retval < 0) - dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n", - FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("wstat error", fcall); kfree(fcall); return retval; @@ -779,7 +691,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, if (err < 0) dprintk(DEBUG_ERROR, "stat error\n"); else { - v9fs_mistat2inode(fcall->params.rstat.stat, dentry->d_inode, + v9fs_stat2inode(&fcall->params.rstat.stat, dentry->d_inode, dentry->d_inode->i_sb); generic_fillattr(dentry->d_inode, stat); } @@ -800,57 +712,44 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); struct v9fs_fid *fid = v9fs_fid_lookup(dentry); struct v9fs_fcall *fcall = NULL; - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); + struct v9fs_wstat wstat; int res = -EPERM; dprintk(DEBUG_VFS, "\n"); - if (!mistat) - return -ENOMEM; - if (!fid) { dprintk(DEBUG_ERROR, "Couldn't find fid associated with dentry\n"); return -EBADF; } - v9fs_blank_mistat(v9ses, mistat); + v9fs_blank_wstat(&wstat); if (iattr->ia_valid & ATTR_MODE) - mistat->mode = unixmode2p9mode(v9ses, iattr->ia_mode); + wstat.mode = unixmode2p9mode(v9ses, iattr->ia_mode); if (iattr->ia_valid & ATTR_MTIME) - mistat->mtime = iattr->ia_mtime.tv_sec; + wstat.mtime = iattr->ia_mtime.tv_sec; if (iattr->ia_valid & ATTR_ATIME) - mistat->atime = iattr->ia_atime.tv_sec; + wstat.atime = iattr->ia_atime.tv_sec; if (iattr->ia_valid & ATTR_SIZE) - mistat->length = iattr->ia_size; + wstat.length = iattr->ia_size; if (v9ses->extended) { - char *ptr = mistat->data+1; + if (iattr->ia_valid & ATTR_UID) + wstat.n_uid = iattr->ia_uid; - if (iattr->ia_valid & ATTR_UID) { - mistat->uid = ptr; - ptr += 1+sprintf(ptr, "%08x", iattr->ia_uid); - mistat->n_uid = iattr->ia_uid; - } - - if (iattr->ia_valid & ATTR_GID) { - mistat->gid = ptr; - ptr += 1+sprintf(ptr, "%08x", iattr->ia_gid); - mistat->n_gid = iattr->ia_gid; - } + if (iattr->ia_valid & ATTR_GID) + wstat.n_gid = iattr->ia_gid; } - res = v9fs_t_wstat(v9ses, fid->fid, mistat, &fcall); + res = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall); if (res < 0) - dprintk(DEBUG_ERROR, "wstat error: %s\n", FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("wstat error", fcall); - kfree(mistat); kfree(fcall); - if (res >= 0) res = inode_setattr(dentry->d_inode, iattr); @@ -858,51 +757,42 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) } /** - * v9fs_mistat2inode - populate an inode structure with mistat info - * @mistat: Plan 9 metadata (mistat) structure + * v9fs_stat2inode - populate an inode structure with mistat info + * @stat: Plan 9 metadata (mistat) structure * @inode: inode to populate * @sb: superblock of filesystem * */ void -v9fs_mistat2inode(struct v9fs_stat *mistat, struct inode *inode, - struct super_block *sb) +v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode, + struct super_block *sb) { + char ext[32]; struct v9fs_session_info *v9ses = sb->s_fs_info; inode->i_nlink = 1; - inode->i_atime.tv_sec = mistat->atime; - inode->i_mtime.tv_sec = mistat->mtime; - inode->i_ctime.tv_sec = mistat->mtime; + inode->i_atime.tv_sec 
= stat->atime; + inode->i_mtime.tv_sec = stat->mtime; + inode->i_ctime.tv_sec = stat->mtime; - inode->i_uid = -1; - inode->i_gid = -1; + inode->i_uid = v9ses->uid; + inode->i_gid = v9ses->gid; if (v9ses->extended) { - /* TODO: string to uid mapping via user-space daemon */ - inode->i_uid = mistat->n_uid; - inode->i_gid = mistat->n_gid; - - if (mistat->n_uid == -1) - sscanf(mistat->uid, "%x", &inode->i_uid); - - if (mistat->n_gid == -1) - sscanf(mistat->gid, "%x", &inode->i_gid); + inode->i_uid = stat->n_uid; + inode->i_gid = stat->n_gid; } - if (inode->i_uid == -1) - inode->i_uid = v9ses->uid; - if (inode->i_gid == -1) - inode->i_gid = v9ses->gid; - - inode->i_mode = p9mode2unixmode(v9ses, mistat->mode); + inode->i_mode = p9mode2unixmode(v9ses, stat->mode); if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) { char type = 0; int major = -1; int minor = -1; - sscanf(mistat->extension, "%c %u %u", &type, &major, &minor); + + v9fs_str_copy(ext, sizeof(ext), &stat->extension); + sscanf(ext, "%c %u %u", &type, &major, &minor); switch (type) { case 'c': inode->i_mode &= ~S_IFBLK; @@ -911,14 +801,14 @@ v9fs_mistat2inode(struct v9fs_stat *mistat, struct inode *inode, case 'b': break; default: - dprintk(DEBUG_ERROR, "Unknown special type %c (%s)\n", - type, mistat->extension); + dprintk(DEBUG_ERROR, "Unknown special type %c (%.*s)\n", + type, stat->extension.len, stat->extension.str); }; inode->i_rdev = MKDEV(major, minor); } else inode->i_rdev = 0; - inode->i_size = mistat->length; + inode->i_size = stat->length; inode->i_blksize = sb->s_blocksize; inode->i_blocks = @@ -945,72 +835,6 @@ ino_t v9fs_qid2ino(struct v9fs_qid *qid) return i; } -/** - * v9fs_vfs_symlink - helper function to create symlinks - * @dir: directory inode containing symlink - * @dentry: dentry for symlink - * @symname: symlink data - * - * See 9P2000.u RFC for more information - * - */ - -static int -v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) -{ - int retval = -EPERM; - struct v9fs_fid *newfid; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); - struct v9fs_fcall *fcall = NULL; - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); - int err; - - dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, - symname); - - if (!mistat) - return -ENOMEM; - - if (!v9ses->extended) { - dprintk(DEBUG_ERROR, "not extended\n"); - goto FreeFcall; - } - - /* issue a create */ - retval = v9fs_create(dir, dentry, S_IFLNK, 0); - if (retval != 0) - goto FreeFcall; - - newfid = v9fs_fid_lookup(dentry); - - /* issue a twstat */ - v9fs_blank_mistat(v9ses, mistat); - strcpy(mistat->data + 1, symname); - mistat->extension = mistat->data + 1; - retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall); - if (retval < 0) { - dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n", - FCALL_ERROR(fcall)); - goto FreeFcall; - } - - kfree(fcall); - - err = v9fs_t_clunk(v9ses, newfid->fid); - if (err < 0) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: %d\n", err); - goto FreeFcall; - } - - d_drop(dentry); /* FID - will this also clunk? 
*/ - - FreeFcall: - kfree(mistat); - kfree(fcall); - - return retval; -} - /** * v9fs_readlink - read a symlink's location (internal version) * @dentry: dentry for symlink @@ -1050,16 +874,17 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) if (!fcall) return -EIO; - if (!(fcall->params.rstat.stat->mode & V9FS_DMSYMLINK)) { + if (!(fcall->params.rstat.stat.mode & V9FS_DMSYMLINK)) { retval = -EINVAL; goto FreeFcall; } /* copy extension buffer into buffer */ - if (strlen(fcall->params.rstat.stat->extension) < buflen) - buflen = strlen(fcall->params.rstat.stat->extension); + if (fcall->params.rstat.stat.extension.len < buflen) + buflen = fcall->params.rstat.stat.extension.len; - memcpy(buffer, fcall->params.rstat.stat->extension, buflen + 1); + memcpy(buffer, fcall->params.rstat.stat.extension.str, buflen - 1); + buffer[buflen-1] = 0; retval = buflen; @@ -1149,82 +974,111 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void __putname(s); } -/** - * v9fs_vfs_link - create a hardlink - * @old_dentry: dentry for file to link to - * @dir: inode destination for new link - * @dentry: dentry for link - * - */ - -/* XXX - lots of code dup'd from symlink and creates, - * figure out a better reuse strategy - */ - -static int -v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *dentry) +static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, + int mode, const char *extension) { - int retval = -EPERM; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); - struct v9fs_fcall *fcall = NULL; - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); - struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry); - struct v9fs_fid *newfid = NULL; - char *symname = __getname(); - int err; + int err, retval; + struct v9fs_session_info *v9ses; + struct v9fs_fcall *fcall; + struct v9fs_fid *fid; + struct v9fs_wstat wstat; - dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, - old_dentry->d_name.name); + v9ses = v9fs_inode2v9ses(dir); + retval = -EPERM; + fcall = NULL; if (!v9ses->extended) { dprintk(DEBUG_ERROR, "not extended\n"); - goto FreeMem; + goto free_mem; } - /* get fid of old_dentry */ - sprintf(symname, "hardlink(%d)\n", oldfid->fid); - /* issue a create */ - retval = v9fs_create(dir, dentry, V9FS_DMLINK, 0); + retval = v9fs_create(dir, dentry, mode, 0); if (retval != 0) - goto FreeMem; + goto free_mem; - newfid = v9fs_fid_lookup(dentry); - if (!newfid) { + fid = v9fs_fid_get_created(dentry); + if (!fid) { dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n"); - goto FreeMem; + goto free_mem; } - /* issue a twstat */ - v9fs_blank_mistat(v9ses, mistat); - strcpy(mistat->data + 1, symname); - mistat->extension = mistat->data + 1; - retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall); + /* issue a Twstat */ + v9fs_blank_wstat(&wstat); + wstat.muid = v9ses->name; + wstat.extension = (char *) extension; + retval = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall); if (retval < 0) { - dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n", - FCALL_ERROR(fcall)); - goto FreeMem; + PRINT_FCALL_ERROR("wstat error", fcall); + goto free_mem; } - kfree(fcall); - - err = v9fs_t_clunk(v9ses, newfid->fid); - + err = v9fs_t_clunk(v9ses, fid->fid); if (err < 0) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: %d\n", err); - goto FreeMem; + dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); + goto free_mem; } d_drop(dentry); /* FID - will this also clunk? 
*/ +free_mem: kfree(fcall); - fcall = NULL; + return retval; +} + +/** + * v9fs_vfs_symlink - helper function to create symlinks + * @dir: directory inode containing symlink + * @dentry: dentry for symlink + * @symname: symlink data + * + * See 9P2000.u RFC for more information + * + */ + +static int +v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, + symname); + + return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname); +} + +/** + * v9fs_vfs_link - create a hardlink + * @old_dentry: dentry for file to link to + * @dir: inode destination for new link + * @dentry: dentry for link + * + */ + +/* XXX - lots of code dup'd from symlink and creates, + * figure out a better reuse strategy + */ + +static int +v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int retval; + struct v9fs_fid *oldfid; + char *name; + + dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, + old_dentry->d_name.name); + + oldfid = v9fs_fid_lookup(old_dentry); + if (!oldfid) { + dprintk(DEBUG_ERROR, "can't find oldfid\n"); + return -EPERM; + } + + name = __getname(); + sprintf(name, "hardlink(%d)\n", oldfid->fid); + retval = v9fs_vfs_mkspecial(dir, dentry, V9FS_DMLINK, name); + __putname(name); - FreeMem: - kfree(mistat); - kfree(fcall); - __putname(symname); return retval; } @@ -1240,83 +1094,30 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, static int v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) { - int retval = -EPERM; - struct v9fs_fid *newfid; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); - struct v9fs_fcall *fcall = NULL; - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); - char *symname = __getname(); - int err; + int retval; + char *name; dprintk(DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); - if (!mistat) - return -ENOMEM; - - if (!new_valid_dev(rdev)) { - retval = -EINVAL; - goto FreeMem; - } - - if (!v9ses->extended) { - dprintk(DEBUG_ERROR, "not extended\n"); - goto FreeMem; - } - - /* issue a create */ - retval = v9fs_create(dir, dentry, mode, 0); - - if (retval != 0) - goto FreeMem; - - newfid = v9fs_fid_lookup(dentry); - if (!newfid) { - dprintk(DEBUG_ERROR, "coudn't resove fid from dentry\n"); - retval = -EINVAL; - goto FreeMem; - } + if (!new_valid_dev(rdev)) + return -EINVAL; + name = __getname(); /* build extension */ if (S_ISBLK(mode)) - sprintf(symname, "b %u %u", MAJOR(rdev), MINOR(rdev)); + sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev)); else if (S_ISCHR(mode)) - sprintf(symname, "c %u %u", MAJOR(rdev), MINOR(rdev)); + sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); else if (S_ISFIFO(mode)) - ; /* DO NOTHING */ + *name = 0; else { - retval = -EINVAL; - goto FreeMem; + __putname(name); + return -EINVAL; } - if (!S_ISFIFO(mode)) { - /* issue a twstat */ - v9fs_blank_mistat(v9ses, mistat); - strcpy(mistat->data + 1, symname); - mistat->extension = mistat->data + 1; - retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall); - if (retval < 0) { - dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n", - FCALL_ERROR(fcall)); - goto FreeMem; - } - } - - /* need to update dcache so we show up */ - kfree(fcall); - - err = v9fs_t_clunk(v9ses, newfid->fid); - if (err < 0) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: %d\n", err); - goto FreeMem; - } - - d_drop(dentry); /* FID - will this also 
clunk? */ - - FreeMem: - kfree(mistat); - kfree(fcall); - __putname(symname); + retval = v9fs_vfs_mkspecial(dir, dentry, mode, name); + __putname(name); return retval; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 83b6edd098..d4d71a9ca5 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -44,7 +44,6 @@ #include "v9fs.h" #include "9p.h" #include "v9fs_vfs.h" -#include "conv.h" #include "fid.h" static void v9fs_clear_inode(struct inode *); @@ -123,10 +122,11 @@ static struct super_block *v9fs_get_sb(struct file_system_type dprintk(DEBUG_VFS, " \n"); - v9ses = kcalloc(1, sizeof(struct v9fs_session_info), GFP_KERNEL); + v9ses = kmalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) return ERR_PTR(-ENOMEM); + memset(v9ses, 0, sizeof(struct v9fs_session_info)); if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); kfree(v9ses); @@ -168,10 +168,10 @@ static struct super_block *v9fs_get_sb(struct file_system_type goto put_back_sb; } - root_fid->qid = fcall->params.rstat.stat->qid; + root_fid->qid = fcall->params.rstat.stat.qid; root->d_inode->i_ino = - v9fs_qid2ino(&fcall->params.rstat.stat->qid); - v9fs_mistat2inode(fcall->params.rstat.stat, root->d_inode, sb); + v9fs_qid2ino(&fcall->params.rstat.stat.qid); + v9fs_stat2inode(&fcall->params.rstat.stat, root->d_inode, sb); } kfree(fcall); -- cgit v1.2.2 From 1dac06b20dcc8078dab037bd70652c69c67ba672 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Sun, 8 Jan 2006 01:05:02 -0800 Subject: [PATCH] v9fs: handle kthread_create failure, minor bugfixes - remove unnecessary -ENOMEM assignments - return correct value when buf_check_size for second time in a buffer - handle failures when create_workqueue and kthread_create are called - use kzalloc instead of kmalloc/memset 0 - v9fs_str_copy and v9fs_str_compare were buggy, were used only in one place, correct the logic and move it to the place it is used. 
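For illustration only (not part of the patch), a minimal sketch of the failure handling that the create_workqueue/kthread_create item above describes; example_start() is a hypothetical wrapper, and v9fs_poll_proc() is the poll worker from mux.c. The point is that create_workqueue() reports failure with a NULL pointer while kthread_create() returns an ERR_PTR()-encoded errno, so the two calls need different checks:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/workqueue.h>

static int example_start(void *poll_data)
{
	struct workqueue_struct *wq;
	struct task_struct *task;

	wq = create_workqueue("v9fs");
	if (!wq)			/* create_workqueue(): NULL on failure */
		return -ENOMEM;

	task = kthread_create(v9fs_poll_proc, poll_data, "v9fs-poll");
	if (IS_ERR(task)) {		/* kthread_create(): ERR_PTR(-errno) */
		destroy_workqueue(wq);
		return PTR_ERR(task);
	}
	wake_up_process(task);

	return 0;
}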
Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/9p.c | 10 ---------- fs/9p/9p.h | 3 --- fs/9p/conv.c | 45 +++++++++------------------------------------ fs/9p/error.h | 1 - fs/9p/mux.c | 38 ++++++++++++++++++++++++++------------ fs/9p/mux.h | 3 ++- fs/9p/v9fs.c | 19 ++++++++++++++----- fs/9p/vfs_inode.c | 7 ++++++- fs/9p/vfs_super.c | 3 +-- 9 files changed, 58 insertions(+), 71 deletions(-) diff --git a/fs/9p/9p.c b/fs/9p/9p.c index dc3ce44ec8..1a6d08761f 100644 --- a/fs/9p/9p.c +++ b/fs/9p/9p.c @@ -86,7 +86,6 @@ v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, dprintk(DEBUG_9P, "uname '%s' aname '%s' fid %d afid %d\n", uname, aname, fid, afid); - ret = -ENOMEM; tc = v9fs_create_tattach(fid, afid, uname, aname); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); @@ -136,7 +135,6 @@ v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid) dprintk(DEBUG_9P, "fid %d\n", fid); - ret = -ENOMEM; rc = NULL; tc = v9fs_create_tclunk(fid); if (!IS_ERR(tc)) @@ -165,7 +163,6 @@ int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag) dprintk(DEBUG_9P, "oldtag %d\n", oldtag); - ret = -ENOMEM; tc = v9fs_create_tflush(oldtag); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, NULL); @@ -221,7 +218,6 @@ v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid, dprintk(DEBUG_9P, "fid %d\n", fid); - ret = -ENOMEM; tc = v9fs_create_twstat(fid, wstat, v9ses->extended); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); @@ -259,7 +255,6 @@ v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid, else nwname = 0; - ret = -ENOMEM; tc = v9fs_create_twalk(fid, newfid, nwname, &name); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); @@ -289,7 +284,6 @@ v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode, dprintk(DEBUG_9P, "fid %d mode %d\n", fid, mode); - ret = -ENOMEM; tc = v9fs_create_topen(fid, mode); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); @@ -317,7 +311,6 @@ v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid, dprintk(DEBUG_9P, "fid %d\n", fid); - ret = -ENOMEM; tc = v9fs_create_tremove(fid); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); @@ -349,7 +342,6 @@ v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name, dprintk(DEBUG_9P, "fid %d name '%s' perm %x mode %d\n", fid, name, perm, mode); - ret = -ENOMEM; tc = v9fs_create_tcreate(fid, name, perm, mode); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); @@ -380,7 +372,6 @@ v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset, dprintk(DEBUG_9P, "fid %d offset 0x%llux count 0x%x\n", fid, (long long unsigned) offset, count); - ret = -ENOMEM; tc = v9fs_create_tread(fid, offset, count); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); @@ -418,7 +409,6 @@ v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset, u32 count, dprintk(DEBUG_9P, "fid %d offset 0x%llux count 0x%x\n", fid, (long long unsigned) offset, count); - ret = -ENOMEM; tc = v9fs_create_twrite(fid, offset, count, data); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); diff --git a/fs/9p/9p.h b/fs/9p/9p.h index 007ff63977..0cd374d947 100644 --- a/fs/9p/9p.h +++ b/fs/9p/9p.h @@ -340,9 +340,6 @@ struct v9fs_fcall { fcall?fcall->params.rerror.error.len:0, \ fcall?fcall->params.rerror.error.str:""); -char *v9fs_str_copy(char *buf, int buflen, struct v9fs_str *str); -int v9fs_str_compare(char *buf, struct v9fs_str *str); - int v9fs_t_version(struct 
v9fs_session_info *v9ses, u32 msize, char *version, struct v9fs_fcall **rcall); diff --git a/fs/9p/conv.c b/fs/9p/conv.c index f62434d435..55ccfa10ee 100644 --- a/fs/9p/conv.c +++ b/fs/9p/conv.c @@ -45,37 +45,6 @@ struct cbuf { unsigned char *ep; }; -char *v9fs_str_copy(char *buf, int buflen, struct v9fs_str *str) -{ - int n; - - if (buflen < str->len) - n = buflen; - else - n = str->len; - - memmove(buf, str->str, n - 1); - - return buf; -} - -int v9fs_str_compare(char *buf, struct v9fs_str *str) -{ - int n, ret; - - ret = strncmp(buf, str->str, str->len); - - if (!ret) { - n = strlen(buf); - if (n < str->len) - ret = -1; - else if (n > str->len) - ret = 1; - } - - return ret; -} - static inline void buf_init(struct cbuf *buf, void *data, int datalen) { buf->sp = buf->p = data; @@ -89,11 +58,14 @@ static inline int buf_check_overflow(struct cbuf *buf) static inline int buf_check_size(struct cbuf *buf, int len) { - if (buf->p + len > buf->ep && buf->p < buf->ep) { - eprintk(KERN_ERR, "buffer overflow: want %d has %d\n", - len, (int)(buf->ep - buf->p)); - dump_stack(); - buf->p = buf->ep + 1; + if (buf->p + len > buf->ep) { + if (buf->p < buf->ep) { + eprintk(KERN_ERR, "buffer overflow: want %d has %d\n", + len, (int)(buf->ep - buf->p)); + dump_stack(); + buf->p = buf->ep + 1; + } + return 0; } @@ -527,6 +499,7 @@ v9fs_create_common(struct cbuf *bufp, u32 size, u8 id) void v9fs_set_tag(struct v9fs_fcall *fc, u16 tag) { + fc->tag = tag; *(__le16 *) (fc->sdata + 5) = cpu_to_le16(tag); } diff --git a/fs/9p/error.h b/fs/9p/error.h index 8b3176b3f6..a9794e85fe 100644 --- a/fs/9p/error.h +++ b/fs/9p/error.h @@ -176,4 +176,3 @@ static struct errormap errmap[] = { }; extern int v9fs_error_init(void); -extern int v9fs_errstr2errno(char *errstr, int len); diff --git a/fs/9p/mux.c b/fs/9p/mux.c index f21cf50839..945cb368d4 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -102,8 +102,6 @@ struct v9fs_mux_rpc { wait_queue_head_t wqueue; }; -extern int v9fs_errstr2errno(char *str, int len); - static int v9fs_poll_proc(void *); static void v9fs_read_work(void *); static void v9fs_write_work(void *); @@ -119,7 +117,7 @@ static int v9fs_mux_num; static int v9fs_mux_poll_task_num; static struct v9fs_mux_poll_task v9fs_mux_poll_tasks[100]; -void v9fs_mux_global_init(void) +int v9fs_mux_global_init(void) { int i; @@ -127,6 +125,10 @@ void v9fs_mux_global_init(void) v9fs_mux_poll_tasks[i].task = NULL; v9fs_mux_wq = create_workqueue("v9fs"); + if (!v9fs_mux_wq) + return -ENOMEM; + + return 0; } void v9fs_mux_global_exit(void) @@ -156,10 +158,11 @@ inline int v9fs_mux_calc_poll_procs(int muxnum) return n; } -static void v9fs_mux_poll_start(struct v9fs_mux_data *m) +static int v9fs_mux_poll_start(struct v9fs_mux_data *m) { int i, n; struct v9fs_mux_poll_task *vpt, *vptlast; + struct task_struct *pproc; dprintk(DEBUG_MUX, "mux %p muxnum %d procnum %d\n", m, v9fs_mux_num, v9fs_mux_poll_task_num); @@ -171,13 +174,16 @@ static void v9fs_mux_poll_start(struct v9fs_mux_data *m) if (v9fs_mux_poll_tasks[i].task == NULL) { vpt = &v9fs_mux_poll_tasks[i]; dprintk(DEBUG_MUX, "create proc %p\n", vpt); - vpt->task = - kthread_create(v9fs_poll_proc, vpt, + pproc = kthread_create(v9fs_poll_proc, vpt, "v9fs-poll"); - INIT_LIST_HEAD(&vpt->mux_list); - vpt->muxnum = 0; - v9fs_mux_poll_task_num++; - wake_up_process(vpt->task); + + if (!IS_ERR(pproc)) { + vpt->task = pproc; + INIT_LIST_HEAD(&vpt->mux_list); + vpt->muxnum = 0; + v9fs_mux_poll_task_num++; + wake_up_process(vpt->task); + } break; } } @@ -207,16 +213,21 @@ static void 
v9fs_mux_poll_start(struct v9fs_mux_data *m) } if (i >= ARRAY_SIZE(v9fs_mux_poll_tasks)) { + if (vptlast == NULL) + return -ENOMEM; + dprintk(DEBUG_MUX, "put in proc %d\n", i); list_add(&m->mux_list, &vptlast->mux_list); vptlast->muxnum++; - m->poll_task = vpt; + m->poll_task = vptlast; memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); init_poll_funcptr(&m->pt, v9fs_pollwait); } v9fs_mux_num++; down(&v9fs_mux_task_lock); + + return 0; } static void v9fs_mux_poll_stop(struct v9fs_mux_data *m) @@ -283,7 +294,10 @@ struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, INIT_WORK(&m->wq, v9fs_write_work, m); m->wsched = 0; memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); - v9fs_mux_poll_start(m); + m->poll_task = NULL; + n = v9fs_mux_poll_start(m); + if (n) + return ERR_PTR(n); n = trans->poll(trans, &m->pt); if (n & POLLIN) { diff --git a/fs/9p/mux.h b/fs/9p/mux.h index 02b13b14b0..9473b84f24 100644 --- a/fs/9p/mux.h +++ b/fs/9p/mux.h @@ -40,7 +40,7 @@ struct v9fs_mux_data; typedef void (*v9fs_mux_req_callback)(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err); -void v9fs_mux_global_init(void); +int v9fs_mux_global_init(void); void v9fs_mux_global_exit(void); struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, @@ -55,3 +55,4 @@ int v9fs_mux_rpcnb(struct v9fs_mux_data *m, struct v9fs_fcall *tc, void v9fs_mux_flush(struct v9fs_mux_data *m, int sendflush); void v9fs_mux_cancel(struct v9fs_mux_data *m, int err); +int v9fs_errstr2errno(char *errstr, int len); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 519b21d8b1..5250c428fc 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -269,6 +269,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses, int n = 0; int newfid = -1; int retval = -EINVAL; + struct v9fs_str *version; v9ses->name = __getname(); if (!v9ses->name) @@ -351,13 +352,16 @@ v9fs_session_init(struct v9fs_session_info *v9ses, goto FreeFcall; } - /* Really should check for 9P1 and report error */ - if (!v9fs_str_compare("9P2000.u", &fcall->params.rversion.version)) { + version = &fcall->params.rversion.version; + if (version->len==8 && !memcmp(version->str, "9P2000.u", 8)) { dprintk(DEBUG_9P, "9P2000 UNIX extensions enabled\n"); v9ses->extended = 1; - } else { + } else if (version->len==6 && !memcmp(version->str, "9P2000", 6)) { dprintk(DEBUG_9P, "9P2000 legacy mode enabled\n"); v9ses->extended = 0; + } else { + retval = -EREMOTEIO; + goto FreeFcall; } n = fcall->params.rversion.msize; @@ -449,12 +453,17 @@ extern int v9fs_error_init(void); static int __init init_v9fs(void) { + int ret; + v9fs_error_init(); printk(KERN_INFO "Installing v9fs 9P2000 file system support\n"); - v9fs_mux_global_init(); - return register_filesystem(&v9fs_fs_type); + ret = v9fs_mux_global_init(); + if (!ret) + ret = register_filesystem(&v9fs_fs_type); + + return ret; } /** diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 742bcd0dc4..d933ef1fbd 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -768,6 +768,7 @@ void v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode, struct super_block *sb) { + int n; char ext[32]; struct v9fs_session_info *v9ses = sb->s_fs_info; @@ -791,7 +792,11 @@ v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode, int major = -1; int minor = -1; - v9fs_str_copy(ext, sizeof(ext), &stat->extension); + n = stat->extension.len; + if (n > sizeof(ext)-1) + n = sizeof(ext)-1; + memmove(ext, stat->extension.str, n); + ext[n] = 0; sscanf(ext, "%c %u %u", &type, &major, &minor); switch (type) { case 'c': diff 
--git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index d4d71a9ca5..ae0f06b3c1 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -122,11 +122,10 @@ static struct super_block *v9fs_get_sb(struct file_system_type dprintk(DEBUG_VFS, " \n"); - v9ses = kmalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); + v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) return ERR_PTR(-ENOMEM); - memset(v9ses, 0, sizeof(struct v9fs_session_info)); if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); kfree(v9ses); -- cgit v1.2.2 From 19144d0beab077950f3e767385e3c1050061b475 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Sun, 8 Jan 2006 01:05:02 -0800 Subject: [PATCH] fix gcc4.1 build failure on xconfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/kconfig/qconf.h:25: error: extra qualification ‘ConfigSettings::’ on member ‘readSizes’ scripts/kconfig/qconf.h:26: error: extra qualification ‘ConfigSettings::’ on member ‘writeSizes’ scripts/kconfig/qconf.h:127: error: extra qualification ‘ConfigList::’ on member ‘updateMenuList’ Signed-off-by: Dave Jones Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/kconfig/qconf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/kconfig/qconf.h b/scripts/kconfig/qconf.h index 7c03927d4c..e52f3e90bf 100644 --- a/scripts/kconfig/qconf.h +++ b/scripts/kconfig/qconf.h @@ -22,8 +22,8 @@ public: #if QT_VERSION >= 300 void readListSettings(); - QValueList ConfigSettings::readSizes(const QString& key, bool *ok); - bool ConfigSettings::writeSizes(const QString& key, const QValueList& value); + QValueList readSizes(const QString& key, bool *ok); + bool writeSizes(const QString& key, const QValueList& value); #endif bool showAll; @@ -124,7 +124,7 @@ public: void setParentMenu(void); template - void ConfigList::updateMenuList(P*, struct menu*); + void updateMenuList(P*, struct menu*); bool updateAll; -- cgit v1.2.2 From e0804b17984afa1504649e2535db489d2a3029b6 Mon Sep 17 00:00:00 2001 From: Eric Van Buggenhaut Date: Sun, 8 Jan 2006 01:05:03 -0800 Subject: [PATCH] hw_random: 82801AB PCI Bridge support pci.lst lists device 80862430 as another RNG # grep 80862430 /lib/discover/pci.lst 80862430 bridge i810_rng 82801AB PCI Bridge but it's not listed in rng_pci_tbl[] Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/hw_random.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/hw_random.c b/drivers/char/hw_random.c index 49769f59ea..b3bc2e37e6 100644 --- a/drivers/char/hw_random.c +++ b/drivers/char/hw_random.c @@ -169,6 +169,7 @@ static struct pci_device_id rng_pci_tbl[] = { { 0x8086, 0x2418, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel }, { 0x8086, 0x2428, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel }, + { 0x8086, 0x2430, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel }, { 0x8086, 0x2448, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel }, { 0x8086, 0x244e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel }, { 0x8086, 0x245e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel }, -- cgit v1.2.2 From a771f2b82aa266fe578468deed82f797e26a3dc4 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 8 Jan 2006 01:05:04 -0800 Subject: [PATCH] Add a section about inlining to Documentation/CodingStyle Adds a bit of text to Documentation/Codingstyle to state that inlining everything "just because" is a bad idea Signed-off-by: Arjan van de Ven Signed-off-by: 
Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/CodingStyle | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle index 187e12077e..ce780ef648 100644 --- a/Documentation/CodingStyle +++ b/Documentation/CodingStyle @@ -344,7 +344,7 @@ Remember: if another thread can find your data structure, and you don't have a reference count on it, you almost certainly have a bug. - Chapter 11: Macros, Enums, Inline functions and RTL + Chapter 11: Macros, Enums and RTL Names of macros defining constants and labels in enums are capitalized. @@ -429,7 +429,35 @@ from void pointer to any other pointer type is guaranteed by the C programming language. - Chapter 14: References + Chapter 14: The inline disease + +There appears to be a common misperception that gcc has a magic "make me +faster" speedup option called "inline". While the use of inlines can be +appropriate (for example as a means of replacing macros, see Chapter 11), it +very often is not. Abundant use of the inline keyword leads to a much bigger +kernel, which in turn slows the system as a whole down, due to a bigger +icache footprint for the CPU and simply because there is less memory +available for the pagecache. Just think about it; a pagecache miss causes a +disk seek, which easily takes 5 miliseconds. There are a LOT of cpu cycles +that can go into these 5 miliseconds. + +A reasonable rule of thumb is to not put inline at functions that have more +than 3 lines of code in them. An exception to this rule are the cases where +a parameter is known to be a compiletime constant, and as a result of this +constantness you *know* the compiler will be able to optimize most of your +function away at compile time. For a good example of this later case, see +the kmalloc() inline function. + +Often people argue that adding inline to functions that are static and used +only once is always a win since there is no space tradeoff. While this is +technically correct, gcc is capable of inlining these automatically without +help, and the maintenance issue of removing the inline when a second user +appears outweighs the potential value of the hint that tells gcc to do +something it would have done anyway. + + + + Chapter 15: References The C Programming Language, Second Edition by Brian W. Kernighan and Dennis M. Ritchie. @@ -453,4 +481,4 @@ Kernel CodingStyle, by greg@kroah.com at OLS 2002: http://www.kroah.com/linux/talks/ols_2002_kernel_codingstyle_talk/html/ -- -Last updated on 16 February 2004 by a community effort on LKML. +Last updated on 30 December 2005 by a community effort on LKML. -- cgit v1.2.2 From d1c4ac408fbab78a5f89ff583b68d2d11b421bb3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:05:05 -0800 Subject: [PATCH] parport_pc: arm build fix free_dma() isn't implemented on ARM unless HAS_DMA is set. 
Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/parport/parport_pc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c index 18e85ccdae..9302b8fd74 100644 --- a/drivers/parport/parport_pc.c +++ b/drivers/parport/parport_pc.c @@ -2371,8 +2371,10 @@ void parport_pc_unregister_port (struct parport *p) spin_lock(&ports_lock); list_del_init(&priv->list); spin_unlock(&ports_lock); +#if defined(CONFIG_PARPORT_PC_FIFO) && defined(HAS_DMA) if (p->dma != PARPORT_DMA_NONE) free_dma(p->dma); +#endif if (p->irq != PARPORT_IRQ_NONE) free_irq(p->irq, p); release_region(p->base, 3); @@ -2380,13 +2382,11 @@ void parport_pc_unregister_port (struct parport *p) release_region(p->base + 3, p->size - 3); if (p->modes & PARPORT_MODE_ECP) release_region(p->base_hi, 3); -#ifdef CONFIG_PARPORT_PC_FIFO -#ifdef HAS_DMA +#if defined(CONFIG_PARPORT_PC_FIFO) && defined(HAS_DMA) if (priv->dma_buf) pci_free_consistent(priv->dev, PAGE_SIZE, priv->dma_buf, priv->dma_handle); -#endif #endif kfree (p->private_data); parport_put_port(p); -- cgit v1.2.2 From d8a33496671e4533aed090793436d58debea6f3a Mon Sep 17 00:00:00 2001 From: Marko Kohtala Date: Sun, 8 Jan 2006 01:05:06 -0800 Subject: [PATCH] parport: bring back an unused phase for ppdev ioctl Earlier fix removed unused phase, but that changed the values for other phases. Since these are exposed to userspace through ppdev, it is safer not to change them. Restore the unused phase value. Signed-off-by: Marko Kohtala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/parport.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/parport.h b/include/linux/parport.h index f7ff0b0c40..f67f838a3a 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -236,12 +236,14 @@ struct pardevice { /* IEEE1284 information */ -/* IEEE1284 phases */ +/* IEEE1284 phases. These are exposed to userland through ppdev IOCTL + * PP[GS]ETPHASE, so do not change existing values. */ enum ieee1284_phase { IEEE1284_PH_FWD_DATA, IEEE1284_PH_FWD_IDLE, IEEE1284_PH_TERMINATE, IEEE1284_PH_NEGOTIATION, + IEEE1284_PH_HBUSY_DNA, IEEE1284_PH_REV_IDLE, IEEE1284_PH_HBUSY_DAVAIL, IEEE1284_PH_REV_DATA, -- cgit v1.2.2 From 6a878184c202395ea17212f111ab9ec4b5f6d6ee Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Sun, 8 Jan 2006 01:05:07 -0800 Subject: [PATCH] Eliminate __attribute__ ((packed)) warnings for gcc-4.1 Since version 4.1 the gcc is warning about ignored attributes. This patch is using the equivalent attribute on the struct instead of on each of the structure or union members. GCC Manual: "Specifying Attributes of Types packed This attribute, attached to struct or union type definition, specifies that each member of the structure or union is placed to minimize the memory required. When attached to an enum definition, it indicates that the smallest integral type should be used. Specifying this attribute for struct and union types is equivalent to specifying the packed attribute on each of the structure or union members." 
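To make that equivalence concrete, here is a small stand-alone sketch (illustrative only, not taken from the patch): on a typical gcc target both structs below occupy 3 bytes instead of the padded 4, and gcc 4.1 additionally warns about the per-member form wherever the attribute is redundant (for example on a single-byte field).

#include <stdio.h>

struct per_member {			/* old style: attribute on each member */
	unsigned char  tag __attribute__((packed));
	unsigned short len __attribute__((packed));
};

struct whole_struct {			/* new style: attribute on the type */
	unsigned char  tag;
	unsigned short len;
} __attribute__((packed));

int main(void)
{
	printf("%zu %zu\n", sizeof(struct per_member),
	       sizeof(struct whole_struct));	/* both print 3 */
	return 0;
}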
Signed-off-by: Jan Blunck Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/isdn/hisax/hisax.h | 18 +++--- drivers/isdn/hisax/hisax_fcpcipnp.h | 18 +++--- drivers/net/3c527.h | 50 +++++++------- drivers/net/irda/vlsi_ir.h | 4 +- drivers/net/wan/sdla.c | 6 +- include/linux/atalk.h | 18 +++--- include/linux/cycx_x25.h | 66 +++++++++---------- include/linux/if_frad.h | 12 ++-- include/linux/isdnif.h | 70 ++++++++++---------- include/linux/ncp.h | 126 ++++++++++++++++++------------------ include/linux/sdla.h | 64 +++++++++--------- include/linux/wavefront.h | 36 +++++------ include/net/dn_dev.h | 84 ++++++++++++------------ include/net/dn_nsp.h | 74 ++++++++++----------- include/sound/wavefront.h | 36 +++++------ 15 files changed, 341 insertions(+), 341 deletions(-) diff --git a/drivers/isdn/hisax/hisax.h b/drivers/isdn/hisax/hisax.h index 26c545fa22..1b85ce166a 100644 --- a/drivers/isdn/hisax/hisax.h +++ b/drivers/isdn/hisax/hisax.h @@ -396,17 +396,17 @@ struct isar_hw { struct hdlc_stat_reg { #ifdef __BIG_ENDIAN - u_char fill __attribute__((packed)); - u_char mode __attribute__((packed)); - u_char xml __attribute__((packed)); - u_char cmd __attribute__((packed)); + u_char fill; + u_char mode; + u_char xml; + u_char cmd; #else - u_char cmd __attribute__((packed)); - u_char xml __attribute__((packed)); - u_char mode __attribute__((packed)); - u_char fill __attribute__((packed)); + u_char cmd; + u_char xml; + u_char mode; + u_char fill; #endif -}; +} __attribute__((packed)); struct hdlc_hw { union { diff --git a/drivers/isdn/hisax/hisax_fcpcipnp.h b/drivers/isdn/hisax/hisax_fcpcipnp.h index bd8a22e4d6..21fbcedf3a 100644 --- a/drivers/isdn/hisax/hisax_fcpcipnp.h +++ b/drivers/isdn/hisax/hisax_fcpcipnp.h @@ -12,17 +12,17 @@ enum { struct hdlc_stat_reg { #ifdef __BIG_ENDIAN - u_char fill __attribute__((packed)); - u_char mode __attribute__((packed)); - u_char xml __attribute__((packed)); - u_char cmd __attribute__((packed)); + u_char fill; + u_char mode; + u_char xml; + u_char cmd; #else - u_char cmd __attribute__((packed)); - u_char xml __attribute__((packed)); - u_char mode __attribute__((packed)); - u_char fill __attribute__((packed)); + u_char cmd; + u_char xml; + u_char mode; + u_char fill; #endif -}; +} __attribute__((packed)); struct fritz_bcs { struct hisax_b_if b_if; diff --git a/drivers/net/3c527.h b/drivers/net/3c527.h index c10f009ce9..53b5b071df 100644 --- a/drivers/net/3c527.h +++ b/drivers/net/3c527.h @@ -32,43 +32,43 @@ struct mc32_mailbox { - u16 mbox __attribute((packed)); - u16 data[1] __attribute((packed)); -}; + u16 mbox; + u16 data[1]; +} __attribute((packed)); struct skb_header { - u8 status __attribute((packed)); - u8 control __attribute((packed)); - u16 next __attribute((packed)); /* Do not change! */ - u16 length __attribute((packed)); - u32 data __attribute((packed)); -}; + u8 status; + u8 control; + u16 next; /* Do not change! 
*/ + u16 length; + u32 data; +} __attribute((packed)); struct mc32_stats { /* RX Errors */ - u32 rx_crc_errors __attribute((packed)); - u32 rx_alignment_errors __attribute((packed)); - u32 rx_overrun_errors __attribute((packed)); - u32 rx_tooshort_errors __attribute((packed)); - u32 rx_toolong_errors __attribute((packed)); - u32 rx_outofresource_errors __attribute((packed)); + u32 rx_crc_errors; + u32 rx_alignment_errors; + u32 rx_overrun_errors; + u32 rx_tooshort_errors; + u32 rx_toolong_errors; + u32 rx_outofresource_errors; - u32 rx_discarded __attribute((packed)); /* via card pattern match filter */ + u32 rx_discarded; /* via card pattern match filter */ /* TX Errors */ - u32 tx_max_collisions __attribute((packed)); - u32 tx_carrier_errors __attribute((packed)); - u32 tx_underrun_errors __attribute((packed)); - u32 tx_cts_errors __attribute((packed)); - u32 tx_timeout_errors __attribute((packed)) ; + u32 tx_max_collisions; + u32 tx_carrier_errors; + u32 tx_underrun_errors; + u32 tx_cts_errors; + u32 tx_timeout_errors; /* various cruft */ - u32 dataA[6] __attribute((packed)); - u16 dataB[5] __attribute((packed)); - u32 dataC[14] __attribute((packed)); -}; + u32 dataA[6]; + u16 dataB[5]; + u32 dataC[14]; +} __attribute((packed)); #define STATUS_MASK 0x0F #define COMPLETED (1<<7) diff --git a/drivers/net/irda/vlsi_ir.h b/drivers/net/irda/vlsi_ir.h index 741aecc655..a82a4ba8de 100644 --- a/drivers/net/irda/vlsi_ir.h +++ b/drivers/net/irda/vlsi_ir.h @@ -577,8 +577,8 @@ struct ring_descr_hw { struct { u8 addr_res[3]; volatile u8 status; /* descriptor status */ - } rd_s __attribute__((packed)); - } rd_u __attribute((packed)); + } __attribute__((packed)) rd_s; + } __attribute((packed)) rd_u; } __attribute__ ((packed)); #define rd_addr rd_u.addr diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c index 036adc4f8b..22e794071c 100644 --- a/drivers/net/wan/sdla.c +++ b/drivers/net/wan/sdla.c @@ -329,9 +329,9 @@ static int sdla_cpuspeed(struct net_device *dev, struct ifreq *ifr) struct _dlci_stat { - short dlci __attribute__((packed)); - char flags __attribute__((packed)); -}; + short dlci; + char flags; +} __attribute__((packed)); struct _frad_stat { diff --git a/include/linux/atalk.h b/include/linux/atalk.h index 911c09cb9b..6ba3aa8a81 100644 --- a/include/linux/atalk.h +++ b/include/linux/atalk.h @@ -155,15 +155,15 @@ struct elapaarp { #define AARP_REQUEST 1 #define AARP_REPLY 2 #define AARP_PROBE 3 - __u8 hw_src[ETH_ALEN] __attribute__ ((packed)); - __u8 pa_src_zero __attribute__ ((packed)); - __be16 pa_src_net __attribute__ ((packed)); - __u8 pa_src_node __attribute__ ((packed)); - __u8 hw_dst[ETH_ALEN] __attribute__ ((packed)); - __u8 pa_dst_zero __attribute__ ((packed)); - __be16 pa_dst_net __attribute__ ((packed)); - __u8 pa_dst_node __attribute__ ((packed)); -}; + __u8 hw_src[ETH_ALEN]; + __u8 pa_src_zero; + __be16 pa_src_net; + __u8 pa_src_node; + __u8 hw_dst[ETH_ALEN]; + __u8 pa_dst_zero; + __be16 pa_dst_net; + __u8 pa_dst_node; +} __attribute__ ((packed)); static __inline__ struct elapaarp *aarp_hdr(struct sk_buff *skb) { diff --git a/include/linux/cycx_x25.h b/include/linux/cycx_x25.h index b10a7f3a8c..f7a9065834 100644 --- a/include/linux/cycx_x25.h +++ b/include/linux/cycx_x25.h @@ -38,11 +38,11 @@ extern unsigned int cycx_debug; /* Data Structures */ /* X.25 Command Block. 
*/ struct cycx_x25_cmd { - u16 command PACKED; - u16 link PACKED; /* values: 0 or 1 */ - u16 len PACKED; /* values: 0 thru 0x205 (517) */ - u32 buf PACKED; -}; + u16 command; + u16 link; /* values: 0 or 1 */ + u16 len; /* values: 0 thru 0x205 (517) */ + u32 buf; +} PACKED; /* Defines for the 'command' field. */ #define X25_CONNECT_REQUEST 0x4401 @@ -92,34 +92,34 @@ struct cycx_x25_cmd { * @flags - see dosx25.doc, in portuguese, for details */ struct cycx_x25_config { - u8 link PACKED; - u8 speed PACKED; - u8 clock PACKED; - u8 n2 PACKED; - u8 n2win PACKED; - u8 n3win PACKED; - u8 nvc PACKED; - u8 pktlen PACKED; - u8 locaddr PACKED; - u8 remaddr PACKED; - u16 t1 PACKED; - u16 t2 PACKED; - u8 t21 PACKED; - u8 npvc PACKED; - u8 t23 PACKED; - u8 flags PACKED; -}; + u8 link; + u8 speed; + u8 clock; + u8 n2; + u8 n2win; + u8 n3win; + u8 nvc; + u8 pktlen; + u8 locaddr; + u8 remaddr; + u16 t1; + u16 t2; + u8 t21; + u8 npvc; + u8 t23; + u8 flags; +} PACKED; struct cycx_x25_stats { - u16 rx_crc_errors PACKED; - u16 rx_over_errors PACKED; - u16 n2_tx_frames PACKED; - u16 n2_rx_frames PACKED; - u16 tx_timeouts PACKED; - u16 rx_timeouts PACKED; - u16 n3_tx_packets PACKED; - u16 n3_rx_packets PACKED; - u16 tx_aborts PACKED; - u16 rx_aborts PACKED; -}; + u16 rx_crc_errors; + u16 rx_over_errors; + u16 n2_tx_frames; + u16 n2_rx_frames; + u16 tx_timeouts; + u16 rx_timeouts; + u16 n3_tx_packets; + u16 n3_rx_packets; + u16 tx_aborts; + u16 rx_aborts; +} PACKED; #endif /* _CYCX_X25_H */ diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h index 511999c7ee..395f0aad9c 100644 --- a/include/linux/if_frad.h +++ b/include/linux/if_frad.h @@ -131,17 +131,17 @@ struct frad_conf /* these are the fields of an RFC 1490 header */ struct frhdr { - unsigned char control __attribute__((packed)); + unsigned char control; /* for IP packets, this can be the NLPID */ - unsigned char pad __attribute__((packed)); + unsigned char pad; - unsigned char NLPID __attribute__((packed)); - unsigned char OUI[3] __attribute__((packed)); - unsigned short PID __attribute__((packed)); + unsigned char NLPID; + unsigned char OUI[3]; + unsigned short PID; #define IP_NLPID pad -}; +} __attribute__((packed)); /* see RFC 1490 for the definition of the following */ #define FRAD_I_UI 0x03 diff --git a/include/linux/isdnif.h b/include/linux/isdnif.h index 7a4eacd77c..04e10f9f14 100644 --- a/include/linux/isdnif.h +++ b/include/linux/isdnif.h @@ -282,43 +282,43 @@ typedef struct setup_parm { typedef struct T30_s { /* session parameters */ - __u8 resolution __attribute__ ((packed)); - __u8 rate __attribute__ ((packed)); - __u8 width __attribute__ ((packed)); - __u8 length __attribute__ ((packed)); - __u8 compression __attribute__ ((packed)); - __u8 ecm __attribute__ ((packed)); - __u8 binary __attribute__ ((packed)); - __u8 scantime __attribute__ ((packed)); - __u8 id[FAXIDLEN] __attribute__ ((packed)); + __u8 resolution; + __u8 rate; + __u8 width; + __u8 length; + __u8 compression; + __u8 ecm; + __u8 binary; + __u8 scantime; + __u8 id[FAXIDLEN]; /* additional parameters */ - __u8 phase __attribute__ ((packed)); - __u8 direction __attribute__ ((packed)); - __u8 code __attribute__ ((packed)); - __u8 badlin __attribute__ ((packed)); - __u8 badmul __attribute__ ((packed)); - __u8 bor __attribute__ ((packed)); - __u8 fet __attribute__ ((packed)); - __u8 pollid[FAXIDLEN] __attribute__ ((packed)); - __u8 cq __attribute__ ((packed)); - __u8 cr __attribute__ ((packed)); - __u8 ctcrty __attribute__ ((packed)); - __u8 minsp __attribute__ ((packed)); - 
__u8 phcto __attribute__ ((packed)); - __u8 rel __attribute__ ((packed)); - __u8 nbc __attribute__ ((packed)); + __u8 phase; + __u8 direction; + __u8 code; + __u8 badlin; + __u8 badmul; + __u8 bor; + __u8 fet; + __u8 pollid[FAXIDLEN]; + __u8 cq; + __u8 cr; + __u8 ctcrty; + __u8 minsp; + __u8 phcto; + __u8 rel; + __u8 nbc; /* remote station parameters */ - __u8 r_resolution __attribute__ ((packed)); - __u8 r_rate __attribute__ ((packed)); - __u8 r_width __attribute__ ((packed)); - __u8 r_length __attribute__ ((packed)); - __u8 r_compression __attribute__ ((packed)); - __u8 r_ecm __attribute__ ((packed)); - __u8 r_binary __attribute__ ((packed)); - __u8 r_scantime __attribute__ ((packed)); - __u8 r_id[FAXIDLEN] __attribute__ ((packed)); - __u8 r_code __attribute__ ((packed)); -} T30_s; + __u8 r_resolution; + __u8 r_rate; + __u8 r_width; + __u8 r_length; + __u8 r_compression; + __u8 r_ecm; + __u8 r_binary; + __u8 r_scantime; + __u8 r_id[FAXIDLEN]; + __u8 r_code; +} __attribute__((packed)) T30_s; #define ISDN_TTY_FAX_CONN_IN 0 #define ISDN_TTY_FAX_CONN_OUT 1 diff --git a/include/linux/ncp.h b/include/linux/ncp.h index 99f77876b7..99f0adeeb3 100644 --- a/include/linux/ncp.h +++ b/include/linux/ncp.h @@ -20,29 +20,29 @@ #define NCP_DEALLOC_SLOT_REQUEST (0x5555) struct ncp_request_header { - __u16 type __attribute__((packed)); - __u8 sequence __attribute__((packed)); - __u8 conn_low __attribute__((packed)); - __u8 task __attribute__((packed)); - __u8 conn_high __attribute__((packed)); - __u8 function __attribute__((packed)); - __u8 data[0] __attribute__((packed)); -}; + __u16 type; + __u8 sequence; + __u8 conn_low; + __u8 task; + __u8 conn_high; + __u8 function; + __u8 data[0]; +} __attribute__((packed)); #define NCP_REPLY (0x3333) #define NCP_WATCHDOG (0x3E3E) #define NCP_POSITIVE_ACK (0x9999) struct ncp_reply_header { - __u16 type __attribute__((packed)); - __u8 sequence __attribute__((packed)); - __u8 conn_low __attribute__((packed)); - __u8 task __attribute__((packed)); - __u8 conn_high __attribute__((packed)); - __u8 completion_code __attribute__((packed)); - __u8 connection_state __attribute__((packed)); - __u8 data[0] __attribute__((packed)); -}; + __u16 type; + __u8 sequence; + __u8 conn_low; + __u8 task; + __u8 conn_high; + __u8 completion_code; + __u8 connection_state; + __u8 data[0]; +} __attribute__((packed)); #define NCP_VOLNAME_LEN (16) #define NCP_NUMBER_OF_VOLUMES (256) @@ -128,37 +128,37 @@ struct nw_nfs_info { }; struct nw_info_struct { - __u32 spaceAlloc __attribute__((packed)); - __le32 attributes __attribute__((packed)); - __u16 flags __attribute__((packed)); - __le32 dataStreamSize __attribute__((packed)); - __le32 totalStreamSize __attribute__((packed)); - __u16 numberOfStreams __attribute__((packed)); - __le16 creationTime __attribute__((packed)); - __le16 creationDate __attribute__((packed)); - __u32 creatorID __attribute__((packed)); - __le16 modifyTime __attribute__((packed)); - __le16 modifyDate __attribute__((packed)); - __u32 modifierID __attribute__((packed)); - __le16 lastAccessDate __attribute__((packed)); - __u16 archiveTime __attribute__((packed)); - __u16 archiveDate __attribute__((packed)); - __u32 archiverID __attribute__((packed)); - __u16 inheritedRightsMask __attribute__((packed)); - __le32 dirEntNum __attribute__((packed)); - __le32 DosDirNum __attribute__((packed)); - __u32 volNumber __attribute__((packed)); - __u32 EADataSize __attribute__((packed)); - __u32 EAKeyCount __attribute__((packed)); - __u32 EAKeySize __attribute__((packed)); - __u32 NSCreator 
__attribute__((packed)); - __u8 nameLen __attribute__((packed)); - __u8 entryName[256] __attribute__((packed)); + __u32 spaceAlloc; + __le32 attributes; + __u16 flags; + __le32 dataStreamSize; + __le32 totalStreamSize; + __u16 numberOfStreams; + __le16 creationTime; + __le16 creationDate; + __u32 creatorID; + __le16 modifyTime; + __le16 modifyDate; + __u32 modifierID; + __le16 lastAccessDate; + __u16 archiveTime; + __u16 archiveDate; + __u32 archiverID; + __u16 inheritedRightsMask; + __le32 dirEntNum; + __le32 DosDirNum; + __u32 volNumber; + __u32 EADataSize; + __u32 EAKeyCount; + __u32 EAKeySize; + __u32 NSCreator; + __u8 nameLen; + __u8 entryName[256]; /* libncp may depend on there being nothing after entryName */ #ifdef __KERNEL__ struct nw_nfs_info nfs; #endif -}; +} __attribute__((packed)); /* modify mask - use with MODIFY_DOS_INFO structure */ #define DM_ATTRIBUTES (cpu_to_le32(0x02)) @@ -176,26 +176,26 @@ struct nw_info_struct { #define DM_MAXIMUM_SPACE (cpu_to_le32(0x2000)) struct nw_modify_dos_info { - __le32 attributes __attribute__((packed)); - __le16 creationDate __attribute__((packed)); - __le16 creationTime __attribute__((packed)); - __u32 creatorID __attribute__((packed)); - __le16 modifyDate __attribute__((packed)); - __le16 modifyTime __attribute__((packed)); - __u32 modifierID __attribute__((packed)); - __u16 archiveDate __attribute__((packed)); - __u16 archiveTime __attribute__((packed)); - __u32 archiverID __attribute__((packed)); - __le16 lastAccessDate __attribute__((packed)); - __u16 inheritanceGrantMask __attribute__((packed)); - __u16 inheritanceRevokeMask __attribute__((packed)); - __u32 maximumSpace __attribute__((packed)); -}; + __le32 attributes; + __le16 creationDate; + __le16 creationTime; + __u32 creatorID; + __le16 modifyDate; + __le16 modifyTime; + __u32 modifierID; + __u16 archiveDate; + __u16 archiveTime; + __u32 archiverID; + __le16 lastAccessDate; + __u16 inheritanceGrantMask; + __u16 inheritanceRevokeMask; + __u32 maximumSpace; +} __attribute__((packed)); struct nw_search_sequence { - __u8 volNumber __attribute__((packed)); - __u32 dirBase __attribute__((packed)); - __u32 sequence __attribute__((packed)); -}; + __u8 volNumber; + __u32 dirBase; + __u32 sequence; +} __attribute__((packed)); #endif /* _LINUX_NCP_H */ diff --git a/include/linux/sdla.h b/include/linux/sdla.h index 3b6afb8caa..564acd3a71 100644 --- a/include/linux/sdla.h +++ b/include/linux/sdla.h @@ -293,46 +293,46 @@ void sdla(void *cfg_info, char *dev, struct frad_conf *conf, int quiet); #define SDLA_S508_INTEN 0x10 struct sdla_cmd { - char opp_flag __attribute__((packed)); - char cmd __attribute__((packed)); - short length __attribute__((packed)); - char retval __attribute__((packed)); - short dlci __attribute__((packed)); - char flags __attribute__((packed)); - short rxlost_int __attribute__((packed)); - long rxlost_app __attribute__((packed)); - char reserve[2] __attribute__((packed)); - char data[SDLA_MAX_DATA] __attribute__((packed)); /* transfer data buffer */ -}; + char opp_flag; + char cmd; + short length; + char retval; + short dlci; + char flags; + short rxlost_int; + long rxlost_app; + char reserve[2]; + char data[SDLA_MAX_DATA]; /* transfer data buffer */ +} __attribute__((packed)); struct intr_info { - char flags __attribute__((packed)); - short txlen __attribute__((packed)); - char irq __attribute__((packed)); - char flags2 __attribute__((packed)); - short timeout __attribute__((packed)); -}; + char flags; + short txlen; + char irq; + char flags2; + short timeout; +} 
__attribute__((packed)); /* found in the 508's control window at RXBUF_INFO */ struct buf_info { - unsigned short rse_num __attribute__((packed)); - unsigned long rse_base __attribute__((packed)); - unsigned long rse_next __attribute__((packed)); - unsigned long buf_base __attribute__((packed)); - unsigned short reserved __attribute__((packed)); - unsigned long buf_top __attribute__((packed)); -}; + unsigned short rse_num; + unsigned long rse_base; + unsigned long rse_next; + unsigned long buf_base; + unsigned short reserved; + unsigned long buf_top; +} __attribute__((packed)); /* structure pointed to by rse_base in RXBUF_INFO struct */ struct buf_entry { - char opp_flag __attribute__((packed)); - short length __attribute__((packed)); - short dlci __attribute__((packed)); - char flags __attribute__((packed)); - short timestamp __attribute__((packed)); - short reserved[2] __attribute__((packed)); - long buf_addr __attribute__((packed)); -}; + char opp_flag; + short length; + short dlci; + char flags; + short timestamp; + short reserved[2]; + long buf_addr; +} __attribute__((packed)); #endif diff --git a/include/linux/wavefront.h b/include/linux/wavefront.h index 61bd0fd352..51ab3c933a 100644 --- a/include/linux/wavefront.h +++ b/include/linux/wavefront.h @@ -434,22 +434,22 @@ typedef struct wf_multisample { } wavefront_multisample; typedef struct wf_alias { - INT16 OriginalSample __attribute__ ((packed)); - - struct wf_sample_offset sampleStartOffset __attribute__ ((packed)); - struct wf_sample_offset loopStartOffset __attribute__ ((packed)); - struct wf_sample_offset sampleEndOffset __attribute__ ((packed)); - struct wf_sample_offset loopEndOffset __attribute__ ((packed)); - - INT16 FrequencyBias __attribute__ ((packed)); - - UCHAR8 SampleResolution:2 __attribute__ ((packed)); - UCHAR8 Unused1:1 __attribute__ ((packed)); - UCHAR8 Loop:1 __attribute__ ((packed)); - UCHAR8 Bidirectional:1 __attribute__ ((packed)); - UCHAR8 Unused2:1 __attribute__ ((packed)); - UCHAR8 Reverse:1 __attribute__ ((packed)); - UCHAR8 Unused3:1 __attribute__ ((packed)); + INT16 OriginalSample; + + struct wf_sample_offset sampleStartOffset; + struct wf_sample_offset loopStartOffset; + struct wf_sample_offset sampleEndOffset; + struct wf_sample_offset loopEndOffset; + + INT16 FrequencyBias; + + UCHAR8 SampleResolution:2; + UCHAR8 Unused1:1; + UCHAR8 Loop:1; + UCHAR8 Bidirectional:1; + UCHAR8 Unused2:1; + UCHAR8 Reverse:1; + UCHAR8 Unused3:1; /* This structure is meant to be padded only to 16 bits on their original. Of course, whoever wrote their documentation didn't @@ -460,8 +460,8 @@ typedef struct wf_alias { standard 16->32 bit issues. 
*/ - UCHAR8 sixteen_bit_padding __attribute__ ((packed)); -} wavefront_alias; + UCHAR8 sixteen_bit_padding; +} __attribute__((packed)) wavefront_alias; typedef struct wf_drum { UCHAR8 PatchNumber; diff --git a/include/net/dn_dev.h b/include/net/dn_dev.h index 86e8e86e62..5a86e78081 100644 --- a/include/net/dn_dev.h +++ b/include/net/dn_dev.h @@ -88,8 +88,8 @@ struct dn_dev { struct net_device *dev; struct dn_dev_parms parms; char use_long; - struct timer_list timer; - unsigned long t3; + struct timer_list timer; + unsigned long t3; struct neigh_parms *neigh_parms; unsigned char addr[ETH_ALEN]; struct neighbour *router; /* Default router on circuit */ @@ -99,57 +99,57 @@ struct dn_dev { struct dn_short_packet { - unsigned char msgflg __attribute__((packed)); - unsigned short dstnode __attribute__((packed)); - unsigned short srcnode __attribute__((packed)); - unsigned char forward __attribute__((packed)); -}; + unsigned char msgflg; + unsigned short dstnode; + unsigned short srcnode; + unsigned char forward; +} __attribute__((packed)); struct dn_long_packet { - unsigned char msgflg __attribute__((packed)); - unsigned char d_area __attribute__((packed)); - unsigned char d_subarea __attribute__((packed)); - unsigned char d_id[6] __attribute__((packed)); - unsigned char s_area __attribute__((packed)); - unsigned char s_subarea __attribute__((packed)); - unsigned char s_id[6] __attribute__((packed)); - unsigned char nl2 __attribute__((packed)); - unsigned char visit_ct __attribute__((packed)); - unsigned char s_class __attribute__((packed)); - unsigned char pt __attribute__((packed)); -}; + unsigned char msgflg; + unsigned char d_area; + unsigned char d_subarea; + unsigned char d_id[6]; + unsigned char s_area; + unsigned char s_subarea; + unsigned char s_id[6]; + unsigned char nl2; + unsigned char visit_ct; + unsigned char s_class; + unsigned char pt; +} __attribute__((packed)); /*------------------------- DRP - Routing messages ---------------------*/ struct endnode_hello_message { - unsigned char msgflg __attribute__((packed)); - unsigned char tiver[3] __attribute__((packed)); - unsigned char id[6] __attribute__((packed)); - unsigned char iinfo __attribute__((packed)); - unsigned short blksize __attribute__((packed)); - unsigned char area __attribute__((packed)); - unsigned char seed[8] __attribute__((packed)); - unsigned char neighbor[6] __attribute__((packed)); - unsigned short timer __attribute__((packed)); - unsigned char mpd __attribute__((packed)); - unsigned char datalen __attribute__((packed)); - unsigned char data[2] __attribute__((packed)); -}; + unsigned char msgflg; + unsigned char tiver[3]; + unsigned char id[6]; + unsigned char iinfo; + unsigned short blksize; + unsigned char area; + unsigned char seed[8]; + unsigned char neighbor[6]; + unsigned short timer; + unsigned char mpd; + unsigned char datalen; + unsigned char data[2]; +} __attribute__((packed)); struct rtnode_hello_message { - unsigned char msgflg __attribute__((packed)); - unsigned char tiver[3] __attribute__((packed)); - unsigned char id[6] __attribute__((packed)); - unsigned char iinfo __attribute__((packed)); - unsigned short blksize __attribute__((packed)); - unsigned char priority __attribute__((packed)); - unsigned char area __attribute__((packed)); - unsigned short timer __attribute__((packed)); - unsigned char mpd __attribute__((packed)); -}; + unsigned char msgflg; + unsigned char tiver[3]; + unsigned char id[6]; + unsigned char iinfo; + unsigned short blksize; + unsigned char priority; + unsigned char area; + 
unsigned short timer; + unsigned char mpd; +} __attribute__((packed)); extern void dn_dev_init(void); diff --git a/include/net/dn_nsp.h b/include/net/dn_nsp.h index 1ba03be0af..e6182b8626 100644 --- a/include/net/dn_nsp.h +++ b/include/net/dn_nsp.h @@ -72,78 +72,78 @@ extern struct sk_buff *dn_alloc_send_skb(struct sock *sk, size_t *size, int nobl struct nsp_data_seg_msg { - unsigned char msgflg __attribute__((packed)); - unsigned short dstaddr __attribute__((packed)); - unsigned short srcaddr __attribute__((packed)); -}; + unsigned char msgflg; + unsigned short dstaddr; + unsigned short srcaddr; +} __attribute__((packed)); struct nsp_data_opt_msg { - unsigned short acknum __attribute__((packed)); - unsigned short segnum __attribute__((packed)); - unsigned short lsflgs __attribute__((packed)); -}; + unsigned short acknum; + unsigned short segnum; + unsigned short lsflgs; +} __attribute__((packed)); struct nsp_data_opt_msg1 { - unsigned short acknum __attribute__((packed)); - unsigned short segnum __attribute__((packed)); -}; + unsigned short acknum; + unsigned short segnum; +} __attribute__((packed)); /* Acknowledgment Message (data/other data) */ struct nsp_data_ack_msg { - unsigned char msgflg __attribute__((packed)); - unsigned short dstaddr __attribute__((packed)); - unsigned short srcaddr __attribute__((packed)); - unsigned short acknum __attribute__((packed)); -}; + unsigned char msgflg; + unsigned short dstaddr; + unsigned short srcaddr; + unsigned short acknum; +} __attribute__((packed)); /* Connect Acknowledgment Message */ struct nsp_conn_ack_msg { - unsigned char msgflg __attribute__((packed)); - unsigned short dstaddr __attribute__((packed)); -}; + unsigned char msgflg; + unsigned short dstaddr; +} __attribute__((packed)); /* Connect Initiate/Retransmit Initiate/Connect Confirm */ struct nsp_conn_init_msg { - unsigned char msgflg __attribute__((packed)); + unsigned char msgflg; #define NSP_CI 0x18 /* Connect Initiate */ #define NSP_RCI 0x68 /* Retrans. Conn Init */ - unsigned short dstaddr __attribute__((packed)); - unsigned short srcaddr __attribute__((packed)); - unsigned char services __attribute__((packed)); + unsigned short dstaddr; + unsigned short srcaddr; + unsigned char services; #define NSP_FC_NONE 0x00 /* Flow Control None */ #define NSP_FC_SRC 0x04 /* Seg Req. Count */ #define NSP_FC_SCMC 0x08 /* Sess. 
Control Mess */ #define NSP_FC_MASK 0x0c /* FC type mask */ - unsigned char info __attribute__((packed)); - unsigned short segsize __attribute__((packed)); -}; + unsigned char info; + unsigned short segsize; +} __attribute__((packed)); /* Disconnect Initiate/Disconnect Confirm */ struct nsp_disconn_init_msg { - unsigned char msgflg __attribute__((packed)); - unsigned short dstaddr __attribute__((packed)); - unsigned short srcaddr __attribute__((packed)); - unsigned short reason __attribute__((packed)); -}; + unsigned char msgflg; + unsigned short dstaddr; + unsigned short srcaddr; + unsigned short reason; +} __attribute__((packed)); struct srcobj_fmt { - char format __attribute__((packed)); - unsigned char task __attribute__((packed)); - unsigned short grpcode __attribute__((packed)); - unsigned short usrcode __attribute__((packed)); - char dlen __attribute__((packed)); -}; + char format; + unsigned char task; + unsigned short grpcode; + unsigned short usrcode; + char dlen; +} __attribute__((packed)); /* * A collection of functions for manipulating the sequence diff --git a/include/sound/wavefront.h b/include/sound/wavefront.h index 9e572aed24..15d82e594b 100644 --- a/include/sound/wavefront.h +++ b/include/sound/wavefront.h @@ -454,22 +454,22 @@ typedef struct wf_multisample { } wavefront_multisample; typedef struct wf_alias { - s16 OriginalSample __attribute__ ((packed)); - - struct wf_sample_offset sampleStartOffset __attribute__ ((packed)); - struct wf_sample_offset loopStartOffset __attribute__ ((packed)); - struct wf_sample_offset sampleEndOffset __attribute__ ((packed)); - struct wf_sample_offset loopEndOffset __attribute__ ((packed)); - - s16 FrequencyBias __attribute__ ((packed)); - - u8 SampleResolution:2 __attribute__ ((packed)); - u8 Unused1:1 __attribute__ ((packed)); - u8 Loop:1 __attribute__ ((packed)); - u8 Bidirectional:1 __attribute__ ((packed)); - u8 Unused2:1 __attribute__ ((packed)); - u8 Reverse:1 __attribute__ ((packed)); - u8 Unused3:1 __attribute__ ((packed)); + s16 OriginalSample; + + struct wf_sample_offset sampleStartOffset; + struct wf_sample_offset loopStartOffset; + struct wf_sample_offset sampleEndOffset; + struct wf_sample_offset loopEndOffset; + + s16 FrequencyBias; + + u8 SampleResolution:2; + u8 Unused1:1; + u8 Loop:1; + u8 Bidirectional:1; + u8 Unused2:1; + u8 Reverse:1; + u8 Unused3:1; /* This structure is meant to be padded only to 16 bits on their original. Of course, whoever wrote their documentation didn't @@ -480,8 +480,8 @@ typedef struct wf_alias { standard 16->32 bit issues. */ - u8 sixteen_bit_padding __attribute__ ((packed)); -} wavefront_alias; + u8 sixteen_bit_padding; +} __attribute__((packed)) wavefront_alias; typedef struct wf_drum { u8 PatchNumber; -- cgit v1.2.2 From bfc090c468b33fb1f75c27a11efa7b7dc250556f Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Sun, 8 Jan 2006 01:05:08 -0800 Subject: [PATCH] afs: remove unnecessary __attribute__((packed)) Remove the unnecessary __attribute__((packed)) since the enum itself is packed and not the location of it in the structure. 
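A short illustrative fragment (hypothetical names, modeled on afs_voltype_t) showing why the member-level attribute is redundant once the enum type itself is declared packed:

typedef enum {
	EXAMPLE_VOL_RWVOL,
	EXAMPLE_VOL_ROVOL,
	EXAMPLE_VOL_BACKVOL
} __attribute__((packed)) example_voltype_t;	/* smallest integral type: 1 byte */

struct example_volume {
	example_voltype_t type;		/* already 1 byte; no attribute needed here */
	char type_force;
	unsigned short nservers;
};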
Signed-off-by: Jan Blunck Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/volume.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/afs/volume.h b/fs/afs/volume.h index 1e691889c4..bfdcf19ba3 100644 --- a/fs/afs/volume.h +++ b/fs/afs/volume.h @@ -18,8 +18,6 @@ #include "kafsasyncd.h" #include "cache.h" -#define __packed __attribute__((packed)) - typedef enum { AFS_VLUPD_SLEEP, /* sleeping waiting for update timer to fire */ AFS_VLUPD_PENDING, /* on pending queue */ @@ -115,7 +113,7 @@ struct afs_volume struct cachefs_cookie *cache; /* caching cookie */ #endif afs_volid_t vid; /* volume ID */ - afs_voltype_t __packed type; /* type of volume */ + afs_voltype_t type; /* type of volume */ char type_force; /* force volume type (suppress R/O -> R/W) */ unsigned short nservers; /* number of server slots filled */ unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ -- cgit v1.2.2 From 9fe656e91fd95d0893cc4831b032e0be60791bd7 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Sun, 8 Jan 2006 01:05:09 -0800 Subject: [PATCH] i4l: __attribute__((packed)) for the CAPI message structs The CAPI message structs itself should be packed and not the location of single fields in the structure. Signed-off-by: Jan Blunck Acked-by: Karsten Keil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/isdn/act2000/capi.h | 88 ++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/drivers/isdn/act2000/capi.h b/drivers/isdn/act2000/capi.h index e82a9289ad..49f453c53c 100644 --- a/drivers/isdn/act2000/capi.h +++ b/drivers/isdn/act2000/capi.h @@ -78,29 +78,29 @@ typedef union actcapi_infoel { /* info element */ typedef struct actcapi_msn { __u8 eaz; __u8 len; /* Length of MSN */ - __u8 msn[15] __attribute__ ((packed)); -} actcapi_msn; + __u8 msn[15]; +} __attribute__((packed)) actcapi_msn; typedef struct actcapi_dlpd { __u8 len; /* Length of structure */ - __u16 dlen __attribute__ ((packed)); /* Data Length */ - __u8 laa __attribute__ ((packed)); /* Link Address A */ + __u16 dlen; /* Data Length */ + __u8 laa; /* Link Address A */ __u8 lab; /* Link Address B */ __u8 modulo; /* Modulo Mode */ __u8 win; /* Window size */ __u8 xid[100]; /* XID Information */ -} actcapi_dlpd; +} __attribute__((packed)) actcapi_dlpd; typedef struct actcapi_ncpd { __u8 len; /* Length of structure */ - __u16 lic __attribute__ ((packed)); - __u16 hic __attribute__ ((packed)); - __u16 ltc __attribute__ ((packed)); - __u16 htc __attribute__ ((packed)); - __u16 loc __attribute__ ((packed)); - __u16 hoc __attribute__ ((packed)); - __u8 modulo __attribute__ ((packed)); -} actcapi_ncpd; + __u16 lic; + __u16 hic; + __u16 ltc; + __u16 htc; + __u16 loc; + __u16 hoc; + __u8 modulo; +} __attribute__((packed)) actcapi_ncpd; #define actcapi_ncpi actcapi_ncpd /* @@ -168,19 +168,19 @@ typedef struct actcapi_msg { __u16 manuf_msg; __u16 controller; actcapi_msn msnmap; - } manufacturer_req_msn; + } __attribute ((packed)) manufacturer_req_msn; /* TODO: TraceInit-req/conf/ind/resp and * TraceDump-req/conf/ind/resp */ struct connect_req { __u8 controller; __u8 bchan; - __u32 infomask __attribute__ ((packed)); + __u32 infomask; __u8 si1; __u8 si2; __u8 eaz; actcapi_addr addr; - } connect_req; + } __attribute__ ((packed)) connect_req; struct connect_conf { __u16 plci; __u16 info; @@ -192,7 +192,7 @@ typedef struct actcapi_msg { __u8 si2; __u8 eaz; actcapi_addr addr; - } connect_ind; + } __attribute__ ((packed)) 
connect_ind; struct connect_resp { __u16 plci; __u8 rejectcause; @@ -200,14 +200,14 @@ typedef struct actcapi_msg { struct connect_active_ind { __u16 plci; actcapi_addr addr; - } connect_active_ind; + } __attribute__ ((packed)) connect_active_ind; struct connect_active_resp { __u16 plci; } connect_active_resp; struct connect_b3_req { __u16 plci; actcapi_ncpi ncpi; - } connect_b3_req; + } __attribute__ ((packed)) connect_b3_req; struct connect_b3_conf { __u16 plci; __u16 ncci; @@ -217,12 +217,12 @@ typedef struct actcapi_msg { __u16 ncci; __u16 plci; actcapi_ncpi ncpi; - } connect_b3_ind; + } __attribute__ ((packed)) connect_b3_ind; struct connect_b3_resp { __u16 ncci; __u8 rejectcause; - actcapi_ncpi ncpi __attribute__ ((packed)); - } connect_b3_resp; + actcapi_ncpi ncpi; + } __attribute__ ((packed)) connect_b3_resp; struct disconnect_req { __u16 plci; __u8 cause; @@ -241,14 +241,14 @@ typedef struct actcapi_msg { struct connect_b3_active_ind { __u16 ncci; actcapi_ncpi ncpi; - } connect_b3_active_ind; + } __attribute__ ((packed)) connect_b3_active_ind; struct connect_b3_active_resp { __u16 ncci; } connect_b3_active_resp; struct disconnect_b3_req { __u16 ncci; actcapi_ncpi ncpi; - } disconnect_b3_req; + } __attribute__ ((packed)) disconnect_b3_req; struct disconnect_b3_conf { __u16 ncci; __u16 info; @@ -257,7 +257,7 @@ typedef struct actcapi_msg { __u16 ncci; __u16 info; actcapi_ncpi ncpi; - } disconnect_b3_ind; + } __attribute__ ((packed)) disconnect_b3_ind; struct disconnect_b3_resp { __u16 ncci; } disconnect_b3_resp; @@ -265,7 +265,7 @@ typedef struct actcapi_msg { __u16 plci; actcapi_infonr nr; actcapi_infoel el; - } info_ind; + } __attribute__ ((packed)) info_ind; struct info_resp { __u16 plci; } info_resp; @@ -279,8 +279,8 @@ typedef struct actcapi_msg { struct select_b2_protocol_req { __u16 plci; __u8 protocol; - actcapi_dlpd dlpd __attribute__ ((packed)); - } select_b2_protocol_req; + actcapi_dlpd dlpd; + } __attribute__ ((packed)) select_b2_protocol_req; struct select_b2_protocol_conf { __u16 plci; __u16 info; @@ -288,47 +288,47 @@ typedef struct actcapi_msg { struct select_b3_protocol_req { __u16 plci; __u8 protocol; - actcapi_ncpd ncpd __attribute__ ((packed)); - } select_b3_protocol_req; + actcapi_ncpd ncpd; + } __attribute__ ((packed)) select_b3_protocol_req; struct select_b3_protocol_conf { __u16 plci; __u16 info; } select_b3_protocol_conf; struct listen_req { __u8 controller; - __u32 infomask __attribute__ ((packed)); - __u16 eazmask __attribute__ ((packed)); - __u16 simask __attribute__ ((packed)); - } listen_req; + __u32 infomask; + __u16 eazmask; + __u16 simask; + } __attribute__ ((packed)) listen_req; struct listen_conf { __u8 controller; - __u16 info __attribute__ ((packed)); - } listen_conf; + __u16 info; + } __attribute__ ((packed)) listen_conf; struct data_b3_req { __u16 fakencci; __u16 datalen; __u32 unused; __u8 blocknr; - __u16 flags __attribute__ ((packed)); - } data_b3_req; + __u16 flags; + } __attribute ((packed)) data_b3_req; struct data_b3_ind { __u16 fakencci; __u16 datalen; __u32 unused; __u8 blocknr; - __u16 flags __attribute__ ((packed)); - } data_b3_ind; + __u16 flags; + } __attribute__ ((packed)) data_b3_ind; struct data_b3_resp { __u16 ncci; __u8 blocknr; - } data_b3_resp; + } __attribute__ ((packed)) data_b3_resp; struct data_b3_conf { __u16 ncci; __u8 blocknr; - __u16 info __attribute__ ((packed)); - } data_b3_conf; + __u16 info; + } __attribute__ ((packed)) data_b3_conf; } msg; -} actcapi_msg; +} __attribute__ ((packed)) actcapi_msg; static inline 
unsigned short actcapi_nextsmsg(act2000_card *card) -- cgit v1.2.2 From 987d4613e52e4f655278265aabbcc69237018b1d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Sun, 8 Jan 2006 01:05:09 -0800 Subject: [PATCH] Make apm buildable without legacy pm APM doesn't _need_ the PM_LEGACY junk, so remove it's dependancy from Kconfig, and ifdef the junk in the code. Whilst the ifdefs are ugly, when the legacy stuff gets ripped out so will the ifdefs. Signed-off-by: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 2 +- arch/i386/kernel/apm.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 486449e9e7..810ba8c37a 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -699,7 +699,7 @@ depends on PM && !X86_VISWS config APM tristate "APM (Advanced Power Management) BIOS support" - depends on PM && PM_LEGACY + depends on PM ---help--- APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 2d793d4aef..9d8827156e 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -2291,7 +2291,9 @@ static int __init apm_init(void) apm_info.disabled = 1; return -ENODEV; } +#ifdef CONFIG_PM_LEGACY pm_active = 1; +#endif /* * Set up a segment that references the real mode segment 0x40 @@ -2382,7 +2384,9 @@ static void __exit apm_exit(void) exit_kapmd = 1; while (kapmd_running) schedule(); +#ifdef CONFIG_PM_LEGACY pm_active = 0; +#endif } module_init(apm_init); -- cgit v1.2.2 From ef9ceab28203690a42d7d3915ccf6e208f0762bc Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:05:10 -0800 Subject: [PATCH] remove semicolons from save_flags() Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/interrupt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 41f150a3d2..e50a95fbeb 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -79,7 +79,7 @@ static inline void __deprecated save_flags(unsigned long *x) { local_save_flags(*x); } -#define save_flags(x) save_flags(&x); +#define save_flags(x) save_flags(&x) static inline void __deprecated restore_flags(unsigned long x) { local_irq_restore(x); -- cgit v1.2.2 From 945f390f02ce44a13aefc6d9449c99f33c9286a5 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Sun, 8 Jan 2006 01:05:11 -0800 Subject: [PATCH] drivers/block: Use ARRAY_SIZE macro Use ARRAY_SIZE macro instead of sizeof(x)/sizeof(x[0]) and remove a duplicate of ARRAY_SIZE. Some trailing whitespaces are also removed. drivers/block/acsi* has been left out as it's marked BROKEN. 
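As a standalone illustration of the idiom this patch applies across drivers/block, here is a minimal userspace sketch; only the macro itself mirrors the kernel's ARRAY_SIZE() in include/linux/kernel.h, while the lookup table and loop are hypothetical:

#include <stdio.h>

/* Same idiom as the kernel's ARRAY_SIZE(): element count of a true array,
 * computed at compile time from the array's own definition. */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

/* Hypothetical lookup table standing in for drive_types[], xd_sigs[], etc. */
static const char *const board_names[] = {
        "DAC960", "cciss", "cpqarray", "floppy", "xd",
};

int main(void)
{
        unsigned int i;

        /* No hand-maintained NR_xxx constant that can drift out of sync
         * with the table it describes. */
        for (i = 0; i < ARRAY_SIZE(board_names); i++)
                printf("%u: %s\n", i, board_names[i]);

        return 0;
}

Because sizeof() is resolved at compile time, the loop bound always tracks the table's real length, which is the whole point of replacing the open-coded sizeof(x)/sizeof(x[0]) divisions.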
Signed-off-by: Tobias Klauser Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/DAC960.c | 2 +- drivers/block/amiflop.c | 2 +- drivers/block/ataflop.c | 2 +- drivers/block/cciss.c | 4 ++-- drivers/block/cpqarray.c | 4 ++-- drivers/block/floppy.c | 9 ++++----- drivers/block/xd.c | 6 +++--- 7 files changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 179c68a3ce..4a7bb7dfce 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -3760,7 +3760,7 @@ static void DAC960_V1_ProcessCompletedCommand(DAC960_Command_T *Command) if (SenseKey == DAC960_SenseKey_VendorSpecific && AdditionalSenseCode == 0x80 && AdditionalSenseCodeQualifier < - sizeof(DAC960_EventMessages) / sizeof(char *)) + ARRAY_SIZE(DAC960_EventMessages)) DAC960_Critical("Physical Device %d:%d %s\n", Controller, EventLogEntry->Channel, EventLogEntry->TargetID, diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index cb2a545e57..3c679d30b6 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -131,7 +131,7 @@ static struct fd_drive_type drive_types[] = { { FD_DD_5, "DD 5.25", 40, 2, 14716, 13630, 1, 40, 81, 6, 30, 2}, { FD_NODRIVE, "No Drive", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; -static int num_dr_types = sizeof(drive_types) / sizeof(drive_types[0]); +static int num_dr_types = ARRAY_SIZE(drive_types); static int amiga_read(int), dos_read(int); static void amiga_write(int), dos_write(int); diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 22bda05fc6..3aa68a5447 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -181,7 +181,7 @@ static struct { { 6, TYPE_HD }, /* 31: H1640 <- was H1600 == h1600 for PC */ }; -#define NUM_DISK_MINORS (sizeof(minor2disktype)/sizeof(*minor2disktype)) +#define NUM_DISK_MINORS ARRAY_SIZE(minor2disktype) /* * Maximum disk size (in kilobytes). 
This default is used whenever the diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 46e8356a96..74818cc650 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -103,7 +103,7 @@ static const struct pci_device_id cciss_pci_device_id[] = { }; MODULE_DEVICE_TABLE(pci, cciss_pci_device_id); -#define NR_PRODUCTS (sizeof(products)/sizeof(struct board_type)) +#define NR_PRODUCTS ARRAY_SIZE(products) /* board_id = Subsystem Device ID & Vendor ID * product = Marketing Name for the board @@ -2833,7 +2833,7 @@ static int cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev) c->board_id = board_id; #ifdef CCISS_DEBUG - print_cfg_table(c->cfgtable); + print_cfg_table(c->cfgtable); #endif /* CCISS_DEBUG */ for(i=0; icmos < NUMBER(default_drive_params)) { + if (UDP->cmos < ARRAY_SIZE(default_drive_params)) { int i = 0; do { int minor = base_minor + (table_sup[UDP->cmos][i] << 2); @@ -4219,7 +4218,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data) !(allowed_drive_mask & (1 << drive)) || fdc_state[FDC(drive)].version == FDC_NONE) return NULL; - if (((*part >> 2) & 0x1f) >= NUMBER(floppy_type)) + if (((*part >> 2) & 0x1f) >= ARRAY_SIZE(floppy_type)) return NULL; *part = 0; return get_disk(disks[drive]); @@ -4571,7 +4570,7 @@ static void unregister_devfs_entries(int drive) { int i; - if (UDP->cmos < NUMBER(default_drive_params)) { + if (UDP->cmos < ARRAY_SIZE(default_drive_params)) { i = 0; do { devfs_remove("floppy/%d%s", drive, diff --git a/drivers/block/xd.c b/drivers/block/xd.c index 97f5dab24b..cbce7c5e94 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -279,11 +279,11 @@ static u_char __init xd_detect (u_char *controller, unsigned int *address) return(1); } - for (i = 0; i < (sizeof(xd_bases) / sizeof(xd_bases[0])); i++) { + for (i = 0; i < ARRAY_SIZE(xd_bases); i++) { void __iomem *p = ioremap(xd_bases[i], 0x2000); if (!p) continue; - for (j = 1; j < (sizeof(xd_sigs) / sizeof(xd_sigs[0])); j++) { + for (j = 1; j < ARRAY_SIZE(xd_sigs); j++) { const char *s = xd_sigs[j].string; if (check_signature(p + xd_sigs[j].offset, s, strlen(s))) { *controller = j; @@ -1018,7 +1018,7 @@ static void __init do_xd_setup (int *integers) case 2: if ((integers[2] > 0) && (integers[2] < 16)) xd_irq = integers[2]; case 1: xd_override = 1; - if ((integers[1] >= 0) && (integers[1] < (sizeof(xd_sigs) / sizeof(xd_sigs[0])))) + if ((integers[1] >= 0) && (integers[1] < ARRAY_SIZE(xd_sigs))) xd_type = integers[1]; case 0: break; default:printk("xd: too many parameters for xd\n"); -- cgit v1.2.2 From f756d5e256059018d753f0ba79980ebeb87a1bc0 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Sun, 8 Jan 2006 01:05:12 -0800 Subject: [PATCH] fix workqueue oops during cpu offline Use first_cpu(cpu_possible_map) for the single-thread workqueue case. We used to hardcode 0, but that broke on systems where !cpu_possible(0) when workqueue_struct->cpu_workqueue_struct was changed from a static array to alloc_percpu. Commit id bce61dd49d6ba7799be2de17c772e4c701558f14 ("Fix hardcoded cpu=0 in workqueue for per_cpu_ptr() calls") fixed that for Ben's funky sparc64 system, but it regressed my Power5. Offlining cpu 0 oopses upon the next call to queue_work for a single-thread workqueue, because now we try to manipulate per_cpu_ptr(wq->cpu_wq, 1), which is uninitialized. So we need to establish an unchanging "slot" for single-thread workqueues which will have a valid percpu allocation. 
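A minimal, self-contained sketch of the pattern this fix settles on, caching one stable slot out of a fixed "possible" set at init time and using it for every single-thread operation; the slot layout, array size and names below are made up for illustration and are not the workqueue code itself:

#include <stdio.h>

#define NR_SLOTS 4

/* Which slots exist at all; fixed once init is done, like cpu_possible_map. */
static const int slot_possible[NR_SLOTS] = { 0, 1, 1, 1 };   /* slot 0 absent */

static int per_slot_data[NR_SLOTS];  /* storage set up for possible slots */
static int singlethread_slot = -1;   /* chosen once at init, never changes */

static int first_possible_slot(void)
{
        int i;

        for (i = 0; i < NR_SLOTS; i++)
                if (slot_possible[i])
                        return i;
        return -1;
}

int main(void)
{
        /* Analogous to init_workqueues() doing
         * singlethread_cpu = first_cpu(cpu_possible_map). */
        singlethread_slot = first_possible_slot();
        if (singlethread_slot < 0)
                return 1;

        /* Queueing work: always use the cached slot, never "whichever slot
         * happens to be online right now", so we can never index storage
         * that was never set up. */
        per_slot_data[singlethread_slot]++;
        printf("queued on slot %d, count %d\n",
               singlethread_slot, per_slot_data[singlethread_slot]);
        return 0;
}
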
Since alloc_percpu keys off of cpu_possible_map, which must not change after initialization, make this slot == first_cpu(cpu_possible_map). Signed-off-by: Nathan Lynch Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e72fb6478d..82c4fa7059 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -29,7 +29,8 @@ #include /* - * The per-CPU workqueue (if single thread, we always use cpu 0's). + * The per-CPU workqueue (if single thread, we always use the first + * possible cpu). * * The sequence counters are for flush_scheduled_work(). It wants to wait * until until all currently-scheduled works are completed, but it doesn't @@ -69,6 +70,8 @@ struct workqueue_struct { static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); +static int singlethread_cpu; + /* If it's single threaded, it isn't in the list of workqueues. */ static inline int is_single_threaded(struct workqueue_struct *wq) { @@ -102,7 +105,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) if (!test_and_set_bit(0, &work->pending)) { if (unlikely(is_single_threaded(wq))) - cpu = any_online_cpu(cpu_online_map); + cpu = singlethread_cpu; BUG_ON(!list_empty(&work->entry)); __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); ret = 1; @@ -118,7 +121,7 @@ static void delayed_work_timer_fn(unsigned long __data) int cpu = smp_processor_id(); if (unlikely(is_single_threaded(wq))) - cpu = any_online_cpu(cpu_online_map); + cpu = singlethread_cpu; __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } @@ -267,7 +270,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) if (is_single_threaded(wq)) { /* Always use first cpu's area. */ - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, any_online_cpu(cpu_online_map))); + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); } else { int cpu; @@ -325,7 +328,7 @@ struct workqueue_struct *__create_workqueue(const char *name, lock_cpu_hotplug(); if (singlethread) { INIT_LIST_HEAD(&wq->list); - p = create_workqueue_thread(wq, any_online_cpu(cpu_online_map)); + p = create_workqueue_thread(wq, singlethread_cpu); if (!p) destroy = 1; else @@ -379,7 +382,7 @@ void destroy_workqueue(struct workqueue_struct *wq) /* We don't need the distraction of CPUs appearing and vanishing. */ lock_cpu_hotplug(); if (is_single_threaded(wq)) - cleanup_workqueue_thread(wq, any_online_cpu(cpu_online_map)); + cleanup_workqueue_thread(wq, singlethread_cpu); else { for_each_online_cpu(cpu) cleanup_workqueue_thread(wq, cpu); @@ -567,6 +570,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, void init_workqueues(void) { + singlethread_cpu = first_cpu(cpu_possible_map); hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); -- cgit v1.2.2 From 96e9dd14a3b2aab4098503a5999ee2ef42d82da1 Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Sun, 8 Jan 2006 01:05:13 -0800 Subject: [PATCH] kconf: Check for eof from input stream. 
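The wrapper this patch adds checks the fgets() return value together with feof() so that a closed input stream aborts the prompting loop instead of spinning forever. A self-contained sketch of the same check follows, with a hypothetical prompt and helper name rather than the actual kconfig code:

#include <stdio.h>
#include <stdlib.h>

/* Read one line of interactive input; bail out cleanly if the stream has
 * hit EOF (e.g. stdin was redirected from a file that ran out of answers). */
static char *read_line_or_abort(char *buf, int size, FILE *stream)
{
        char *ret = fgets(buf, size, stream);

        if (ret == NULL && feof(stream)) {
                fprintf(stderr, "aborted: console input is closed\n");
                exit(1);
        }
        return ret;
}

int main(void)
{
        char line[128];

        printf("Enable FOO? [Y/n] ");
        fflush(stdout);
        if (read_line_or_abort(line, sizeof(line), stdin))
                printf("got: %s", line);
        return 0;
}
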
Signed-off-by: Ben Collins Cc: Sam Ravnborg Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/kconfig/conf.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c index 8ba5d29d3d..10eeae53d8 100644 --- a/scripts/kconfig/conf.c +++ b/scripts/kconfig/conf.c @@ -63,6 +63,20 @@ static void check_stdin(void) } } +static char *fgets_check_stream(char *s, int size, FILE *stream) +{ + char *ret = fgets(s, size, stream); + + if (ret == NULL && feof(stream)) { + printf(_("aborted!\n\n")); + printf(_("Console input is closed. ")); + printf(_("Run 'make oldconfig' to update configuration.\n\n")); + exit(1); + } + + return ret; +} + static void conf_askvalue(struct symbol *sym, const char *def) { enum symbol_type type = sym_get_type(sym); @@ -100,7 +114,7 @@ static void conf_askvalue(struct symbol *sym, const char *def) check_stdin(); case ask_all: fflush(stdout); - fgets(line, 128, stdin); + fgets_check_stream(line, 128, stdin); return; case set_default: printf("%s\n", def); @@ -356,7 +370,7 @@ static int conf_choice(struct menu *menu) check_stdin(); case ask_all: fflush(stdout); - fgets(line, 128, stdin); + fgets_check_stream(line, 128, stdin); strip(line); if (line[0] == '?') { printf("\n%s\n", menu->sym->help ? -- cgit v1.2.2 From 86f91d36c2aac910b7cf7e5d047019b10a9a627e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:05:14 -0800 Subject: [PATCH] i810_audio: request_irq() fix Request the IRQ after having set everything up. Otherwise a shared interrupt at the right time can kill the machine. Found this with David Woodhouse 's debug-shared-irqs.patch. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- sound/oss/i810_audio.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/sound/oss/i810_audio.c b/sound/oss/i810_audio.c index b9a640fe48..4600cd6742 100644 --- a/sound/oss/i810_audio.c +++ b/sound/oss/i810_audio.c @@ -3359,12 +3359,6 @@ static int __devinit i810_probe(struct pci_dev *pci_dev, const struct pci_device goto out_region2; } - if (request_irq(card->irq, &i810_interrupt, SA_SHIRQ, - card_names[pci_id->driver_data], card)) { - printk(KERN_ERR "i810_audio: unable to allocate irq %d\n", card->irq); - goto out_pio; - } - if (card->use_mmio) { if (request_mem_region(card->ac97base_mmio_phys, 512, "ich_audio MMBAR")) { if ((card->ac97base_mmio = ioremap(card->ac97base_mmio_phys, 512))) { /*@FIXME can ioremap fail? 
don't know (jsaw) */ @@ -3395,10 +3389,8 @@ static int __devinit i810_probe(struct pci_dev *pci_dev, const struct pci_device } /* initialize AC97 codec and register /dev/mixer */ - if (i810_ac97_init(card) <= 0) { - free_irq(card->irq, card); + if (i810_ac97_init(card) <= 0) goto out_iospace; - } pci_set_drvdata(pci_dev, card); if(clocking == 0) { @@ -3410,7 +3402,6 @@ static int __devinit i810_probe(struct pci_dev *pci_dev, const struct pci_device if ((card->dev_audio = register_sound_dsp(&i810_audio_fops, -1)) < 0) { int i; printk(KERN_ERR "i810_audio: couldn't register DSP device!\n"); - free_irq(card->irq, card); for (i = 0; i < NR_AC97; i++) if (card->ac97_codec[i] != NULL) { unregister_sound_mixer(card->ac97_codec[i]->dev_mixer); @@ -3419,6 +3410,13 @@ static int __devinit i810_probe(struct pci_dev *pci_dev, const struct pci_device goto out_iospace; } + if (request_irq(card->irq, &i810_interrupt, SA_SHIRQ, + card_names[pci_id->driver_data], card)) { + printk(KERN_ERR "i810_audio: unable to allocate irq %d\n", card->irq); + goto out_iospace; + } + + card->initializing = 0; return 0; -- cgit v1.2.2 From 0f59cc4a35dbbc45c972daad0f1b063380cd9ea4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:05:15 -0800 Subject: [PATCH] simplify k_getrusage() Factor out common code for different RUSAGE_xxx cases. Don't take ->sighand->siglock in RUSAGE_SELF case, suggested by Ravikiran G Thirumalai . Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 218937e837..b6941e06d5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1692,7 +1692,10 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) if (unlikely(!p->signal)) return; + utime = stime = cputime_zero; + switch (who) { + case RUSAGE_BOTH: case RUSAGE_CHILDREN: spin_lock_irqsave(&p->sighand->siglock, flags); utime = p->signal->cutime; @@ -1702,22 +1705,11 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_minflt = p->signal->cmin_flt; r->ru_majflt = p->signal->cmaj_flt; spin_unlock_irqrestore(&p->sighand->siglock, flags); - cputime_to_timeval(utime, &r->ru_utime); - cputime_to_timeval(stime, &r->ru_stime); - break; + + if (who == RUSAGE_CHILDREN) + break; + case RUSAGE_SELF: - spin_lock_irqsave(&p->sighand->siglock, flags); - utime = stime = cputime_zero; - goto sum_group; - case RUSAGE_BOTH: - spin_lock_irqsave(&p->sighand->siglock, flags); - utime = p->signal->cutime; - stime = p->signal->cstime; - r->ru_nvcsw = p->signal->cnvcsw; - r->ru_nivcsw = p->signal->cnivcsw; - r->ru_minflt = p->signal->cmin_flt; - r->ru_majflt = p->signal->cmaj_flt; - sum_group: utime = cputime_add(utime, p->signal->utime); stime = cputime_add(stime, p->signal->stime); r->ru_nvcsw += p->signal->nvcsw; @@ -1734,13 +1726,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt += t->maj_flt; t = next_thread(t); } while (t != p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - cputime_to_timeval(utime, &r->ru_utime); - cputime_to_timeval(stime, &r->ru_stime); break; + default: BUG(); } + + cputime_to_timeval(utime, &r->ru_utime); + cputime_to_timeval(stime, &r->ru_stime); } int getrusage(struct task_struct *p, int who, struct rusage __user *ru) -- cgit v1.2.2 From 6e21bd9a30f1d7eab26ef8b9259376e5322d89e5 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 8 Jan 2006 
01:05:15 -0800 Subject: [PATCH] drivers/isdn/: add missing #includes Every file should #include the headers containing the prototypes for its global functions. Signed-off-by: Adrian Bunk Signed-off-by: Armin Schindler Cc: Karsten Keil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/isdn/capi/capifs.c | 2 ++ drivers/isdn/hardware/eicon/os_bri.c | 1 + drivers/isdn/hardware/eicon/os_pri.c | 1 + 3 files changed, 4 insertions(+) diff --git a/drivers/isdn/capi/capifs.c b/drivers/isdn/capi/capifs.c index 7b564c0dd9..207cae3662 100644 --- a/drivers/isdn/capi/capifs.c +++ b/drivers/isdn/capi/capifs.c @@ -17,6 +17,8 @@ #include #include /* current */ +#include "capifs.h" + MODULE_DESCRIPTION("CAPI4Linux: /dev/capi/ filesystem"); MODULE_AUTHOR("Carsten Paeth"); MODULE_LICENSE("GPL"); diff --git a/drivers/isdn/hardware/eicon/os_bri.c b/drivers/isdn/hardware/eicon/os_bri.c index 4cc44a5dd1..f31bba5b16 100644 --- a/drivers/isdn/hardware/eicon/os_bri.c +++ b/drivers/isdn/hardware/eicon/os_bri.c @@ -16,6 +16,7 @@ #include "diva_pci.h" #include "mi_pc.h" #include "pc_maint.h" +#include "dsrv_bri.h" /* ** IMPORTS diff --git a/drivers/isdn/hardware/eicon/os_pri.c b/drivers/isdn/hardware/eicon/os_pri.c index 8ac207f75e..a296a846f2 100644 --- a/drivers/isdn/hardware/eicon/os_pri.c +++ b/drivers/isdn/hardware/eicon/os_pri.c @@ -18,6 +18,7 @@ #include "pc_maint.h" #include "dsp_tst.h" #include "diva_dma.h" +#include "dsrv_pri.h" /* -------------------------------------------------------------------------- OS Dependent part of XDI driver for DIVA PRI Adapter -- cgit v1.2.2 From 7019e7e41e34388cb87a5f0e3d05e92992418f34 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 8 Jan 2006 01:05:16 -0800 Subject: [PATCH] drivers/isdn/hardware/eicon/os_4bri.c: correct the xdiLoadFile() signature It's not good if caller and callee disagree regarding the type of the arguments. In this case, this could cause problems on 64bit architectures. Signed-off-by: Adrian Bunk Signed-off-by: Armin Schindler Cc: Karsten Keil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/isdn/hardware/eicon/os_4bri.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/isdn/hardware/eicon/os_4bri.c b/drivers/isdn/hardware/eicon/os_4bri.c index cccfabc111..11e6f937c1 100644 --- a/drivers/isdn/hardware/eicon/os_4bri.c +++ b/drivers/isdn/hardware/eicon/os_4bri.c @@ -16,6 +16,7 @@ #include "diva_pci.h" #include "mi_pc.h" #include "dsrv4bri.h" +#include "helpers.h" static void *diva_xdiLoadFileFile = NULL; static dword diva_xdiLoadFileLength = 0; @@ -815,7 +816,7 @@ diva_4bri_cmd_card_proc(struct _diva_os_xdi_adapter *a, return (ret); } -void *xdiLoadFile(char *FileName, unsigned long *FileLength, +void *xdiLoadFile(char *FileName, dword *FileLength, unsigned long lim) { void *ret = diva_xdiLoadFileFile; -- cgit v1.2.2 From 564de74a7e2778e1d11b2d2e50ee8dab1cf1456a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 8 Jan 2006 01:05:17 -0800 Subject: [PATCH] cciss: avoid defining useless MAJOR_NR macro This sneaked in with one of the updates. 
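The os_4bri.c change above and the missing-#include patch before it come down to the same rule: keep one prototype in a header that both the definition and every caller include, so the compiler catches signature drift. In the xdiLoadFile() case that drift was unsigned long * versus dword *, harmless on 32-bit but on LP64 targets the callee would store 8 bytes through a pointer to a 4-byte object. A minimal single-file sketch of the agreed-upon types (the function name and file name are hypothetical):

#include <stdio.h>
#include <stdint.h>

typedef uint32_t dword;   /* the driver's 32-bit length type */

/* One prototype, visible to both the definition and every caller. A stale
 * definition still taking "unsigned long *" would now fail to compile
 * instead of silently corrupting the caller's 4-byte length variable. */
void *load_file(const char *name, dword *length, unsigned long limit);

void *load_file(const char *name, dword *length, unsigned long limit)
{
        (void)name;
        (void)limit;
        *length = 1024;   /* stores exactly 4 bytes, as the caller expects */
        return NULL;
}

int main(void)
{
        dword len = 0;

        load_file("firmware.bin", &len, 0);
        printf("sizeof(unsigned long)=%zu, sizeof(dword)=%zu, len=%u\n",
               sizeof(unsigned long), sizeof(dword), (unsigned int)len);
        return 0;
}
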
Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/cciss.c | 2 +- drivers/block/cciss.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 74818cc650..88452c79fb 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -3118,7 +3118,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, * 8 controller support. */ if (i < MAX_CTLR_ORIG) - hba[i]->major = MAJOR_NR + i; + hba[i]->major = COMPAQ_CISS_MAJOR + i; rc = register_blkdev(hba[i]->major, hba[i]->devname); if(rc == -EBUSY || rc == -EINVAL) { printk(KERN_ERR diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index ad45e581a9..b24fc0553c 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -13,8 +13,6 @@ #define IO_OK 0 #define IO_ERROR 1 -#define MAJOR_NR COMPAQ_CISS_MAJOR - struct ctlr_info; typedef struct ctlr_info ctlr_info_t; -- cgit v1.2.2 From e701e85b9f12af4dbe73b4299b541686ef6b1aa5 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Sun, 8 Jan 2006 01:05:18 -0800 Subject: [PATCH] vr41xx: section tags fix module_init functions must be tagged __init rather than __devinit; likewise, module_exit functions must be tagged __exit rather than __devexit. Signed-off-by: Jean Delvare Cc: Yoichi Yuasa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/vr41xx_giu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/vr41xx_giu.c b/drivers/char/vr41xx_giu.c index 9ac6d43437..a5b18e086a 100644 --- a/drivers/char/vr41xx_giu.c +++ b/drivers/char/vr41xx_giu.c @@ -718,7 +718,7 @@ static struct platform_driver giu_device_driver = { }, }; -static int __devinit vr41xx_giu_init(void) +static int __init vr41xx_giu_init(void) { int retval; @@ -733,7 +733,7 @@ static int __devinit vr41xx_giu_init(void) return retval; } -static void __devexit vr41xx_giu_exit(void) +static void __exit vr41xx_giu_exit(void) { platform_driver_unregister(&giu_device_driver); -- cgit v1.2.2 From d960600df3ce3588571e2c1adf1f5f6d8ca9eb5a Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:19 -0800 Subject: [PATCH] tiny: Add bloat-o-meter to scripts This is a rewrite of Andi Kleen's bloat-o-meter with sorting and reporting of gainers/decliners. Sample output: add/remove: 0/8 grow/shrink: 2/0 up/down: 88/-4424 (-4336) function old new delta __copy_to_user_ll 59 103 +44 __copy_from_user_ll 59 103 +44 fill_note 32 - -32 maydump 58 - -58 dump_seek 67 - -67 writenote 180 - -180 elf_dump_thread_status 274 - -274 fill_psinfo 308 - -308 fill_prstatus 466 - -466 elf_core_dump 3039 - -3039 The summary line says: no functions added, 8 removed two functions grew, none shrunk we gained 88 bytes and lost 4424 (or -4336 net) This work was sponsored in part by CE Linux Forum Signed-off-by: Matt Mackall Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/bloat-o-meter | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 scripts/bloat-o-meter diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter new file mode 100644 index 0000000000..75f21d843c --- /dev/null +++ b/scripts/bloat-o-meter @@ -0,0 +1,58 @@ +#!/usr/bin/python +# +# Copyright 2004 Matt Mackall +# +# inspired by perl Bloat-O-Meter (c) 1997 by Andi Kleen +# +# This software may be used and distributed according to the terms +# of the GNU General Public License, incorporated herein by reference. 
+ +import sys, os, re + +if len(sys.argv) != 3: + sys.stderr.write("usage: %s file1 file2\n" % sys.argv[0]) + sys.exit(-1) + +def getsizes(file): + sym = {} + for l in os.popen("nm --size-sort " + file).readlines(): + size, type, name = l[:-1].split() + if type in "tTdDbB": + sym[name] = int(size, 16) + return sym + +old = getsizes(sys.argv[1]) +new = getsizes(sys.argv[2]) +grow, shrink, add, remove, up, down = 0, 0, 0, 0, 0, 0 +delta, common = [], {} + +for a in old: + if a in new: + common[a] = 1 + +for name in old: + if name not in common: + remove += 1 + down += old[name] + delta.append((-old[name], name)) + +for name in new: + if name not in common: + add += 1 + up += new[name] + delta.append((new[name], name)) + +for name in common: + d = new.get(name, 0) - old.get(name, 0) + if d>0: grow, up = grow+1, up+d + if d<0: shrink, down = shrink+1, down-d + delta.append((d, name)) + +delta.sort() +delta.reverse() + +print "add/remove: %s/%s grow/shrink: %s/%s up/down: %s/%s (%s)" % \ + (add, remove, grow, shrink, up, -down, up-down) +print "%-40s %7s %7s %+7s" % ("function", "old", "new", "delta") +for d, n in delta: + if d: print "%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d) -- cgit v1.2.2 From b01ec0ef63e95570e2463b26333d9c9c854cb941 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:20 -0800 Subject: [PATCH] tiny: Uninline some open.c functions uninline some open.c functions add/remove: 3/0 grow/shrink: 0/6 up/down: 679/-1166 (-487) function old new delta do_sys_truncate - 336 +336 do_sys_ftruncate - 317 +317 __put_unused_fd - 26 +26 put_unused_fd 57 49 -8 sys_close 150 119 -31 sys_ftruncate64 260 26 -234 sys_ftruncate 272 24 -248 sys_truncate 339 25 -314 sys_truncate64 336 5 -331 Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/open.c b/fs/open.c index 94968cb3af..75f3329e8a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -217,7 +217,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, return err; } -static inline long do_sys_truncate(const char __user * path, loff_t length) +static long do_sys_truncate(const char __user * path, loff_t length) { struct nameidata nd; struct inode * inode; @@ -283,7 +283,7 @@ asmlinkage long sys_truncate(const char __user * path, unsigned long length) return do_sys_truncate(path, (long)length); } -static inline long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { struct inode * inode; struct dentry *dentry; @@ -971,7 +971,7 @@ out: EXPORT_SYMBOL(get_unused_fd); -static inline void __put_unused_fd(struct files_struct *files, unsigned int fd) +static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __FD_CLR(fd, fdt->open_fds); -- cgit v1.2.2 From 5d2bea4582d20cb24085152acaa29b95c05cdcf8 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:21 -0800 Subject: [PATCH] tiny: Uninline some inode.c functions uninline a couple inode.c functions add/remove: 2/0 grow/shrink: 0/5 up/down: 256/-428 (-172) function old new delta ifind - 136 +136 ifind_fast - 120 +120 ilookup5_nowait 131 80 -51 ilookup 158 71 -87 ilookup5 171 80 -91 iget_locked 190 95 -95 iget5_locked 240 136 -104 Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index d8d04bd72b..fd568caf7f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -770,7 +770,7 @@ EXPORT_SYMBOL(igrab); * * Note, @test is called with the inode_lock held, so can't sleep. */ -static inline struct inode *ifind(struct super_block *sb, +static struct inode *ifind(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data, const int wait) { @@ -804,7 +804,7 @@ static inline struct inode *ifind(struct super_block *sb, * * Otherwise NULL is returned. */ -static inline struct inode *ifind_fast(struct super_block *sb, +static struct inode *ifind_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) { struct inode *inode; -- cgit v1.2.2 From 33443c42f4ffa5ca23b3323234bcb1a78e85d9db Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:22 -0800 Subject: [PATCH] tiny: Uninline some fslocks.c functions uninline some file locking functions add/remove: 3/0 grow/shrink: 0/15 up/down: 256/-1525 (-1269) function old new delta locks_free_lock - 134 +134 posix_same_owner - 69 +69 __locks_delete_block - 53 +53 posix_locks_conflict 126 108 -18 locks_remove_posix 266 237 -29 locks_wake_up_blocks 121 87 -34 locks_block_on_timeout 83 47 -36 locks_insert_block 157 120 -37 locks_delete_block 62 23 -39 posix_unblock_lock 104 59 -45 posix_locks_deadlock 162 100 -62 locks_delete_lock 228 119 -109 sys_flock 338 217 -121 __break_lease 600 474 -126 lease_init 252 122 -130 fcntl_setlk64 793 649 -144 fcntl_setlk 793 649 -144 __posix_lock_file 1477 1026 -451 Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index fb32d6218e..909eab8fb1 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -154,7 +154,7 @@ static struct file_lock *locks_alloc_lock(void) } /* Free a lock which is not in use. */ -static inline void locks_free_lock(struct file_lock *fl) +static void locks_free_lock(struct file_lock *fl) { if (fl == NULL) { BUG(); @@ -475,8 +475,7 @@ static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2) /* * Check whether two locks have the same owner. */ -static inline int -posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) +static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) { if (fl1->fl_lmops && fl1->fl_lmops->fl_compare_owner) return fl2->fl_lmops == fl1->fl_lmops && @@ -487,7 +486,7 @@ posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) /* Remove waiter from blocker's block list. * When blocker ends up pointing to itself then the list is empty. */ -static inline void __locks_delete_block(struct file_lock *waiter) +static void __locks_delete_block(struct file_lock *waiter) { list_del_init(&waiter->fl_block); list_del_init(&waiter->fl_link); -- cgit v1.2.2 From 18e92b12e83bef077a31ba6871f1aec3621b69e3 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:22 -0800 Subject: [PATCH] tiny: Trim non-IPX builds trivial: drop unused 802.3 code if we compile without IPX (originally from http://wohnheim.fh-wedel.de/~joern/software/kernel/je/25/) Signed-off-by: Matt Mackall Cc: "David S. 
Miller" Cc: Arnaldo Carvalho de Melo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/802/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/802/Makefile b/net/802/Makefile index 0186192959..977704a54f 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -2,8 +2,6 @@ # Makefile for the Linux 802.x protocol layers. # -obj-y := p8023.o - # Check the p8022 selections against net/core/Makefile. obj-$(CONFIG_SYSCTL) += sysctl_net_802.o obj-$(CONFIG_LLC) += p8022.o psnap.o @@ -11,5 +9,5 @@ obj-$(CONFIG_TR) += p8022.o psnap.o tr.o sysctl_net_802.o obj-$(CONFIG_NET_FC) += fc.o obj-$(CONFIG_FDDI) += fddi.o obj-$(CONFIG_HIPPI) += hippi.o -obj-$(CONFIG_IPX) += p8022.o psnap.o +obj-$(CONFIG_IPX) += p8022.o psnap.o p8023.o obj-$(CONFIG_ATALK) += p8022.o psnap.o -- cgit v1.2.2 From 22c4e3084eb8b88288a622a57d8b35c450a439f2 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:24 -0800 Subject: [PATCH] tiny: Make x86 doublefault handling optional This adds configurable support for doublefault reporting on x86 add/remove: 0/3 grow/shrink: 0/1 up/down: 0/-13048 (-13048) function old new delta cpu_init 846 786 -60 doublefault_fn 188 - -188 doublefault_stack 4096 - -4096 doublefault_tss 8704 - -8704 Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/Makefile | 3 ++- arch/i386/kernel/cpu/common.c | 2 ++ init/Kconfig | 9 +++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index f10de0f2c5..4f40589e17 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -7,7 +7,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ - doublefault.o quirks.o i8237.o + quirks.o i8237.o obj-y += cpu/ obj-y += timers/ @@ -33,6 +33,7 @@ obj-y += sysenter.o vsyscall.o obj-$(CONFIG_ACPI_SRAT) += srat.o obj-$(CONFIG_HPET_TIMER) += time_hpet.o obj-$(CONFIG_EFI) += efi.o efi_stub.o +obj-$(CONFIG_DOUBLEFAULT) += doublefault.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o EXTRA_AFLAGS := -traditional diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index cca655688f..170400879f 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -609,8 +609,10 @@ void __devinit cpu_init(void) load_TR_desc(); load_LDT(&init_mm.context); +#ifdef CONFIG_DOUBLEFAULT /* Set up doublefault TSS pointer in the GDT */ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); +#endif /* Clear %fs and %gs. */ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); diff --git a/init/Kconfig b/init/Kconfig index 0c9932f9f0..0eb65f2ad8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -309,6 +309,15 @@ config BUG option for embedded systems with no facilities for reporting errors. Just say Y. +config DOUBLEFAULT + depends X86 + default y if X86 + bool "Enable doublefault exception handler" if EMBEDDED + help + This option allows trapping of rare doublefault exceptions that + would otherwise cause a system to silently reboot. Disabling this + option saves about 4k. 
+ config BASE_FULL default y bool "Enable full-sized data structures for core" if EMBEDDED -- cgit v1.2.2 From e585e47031751f4e393e10ffd922885508b958dd Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:24 -0800 Subject: [PATCH] tiny: Make *[ug]id16 support optional Configurable 16-bit UID and friends support This allows turning off the legacy 16 bit UID interfaces on embedded platforms. text data bss dec hex filename 3330172 529036 190556 4049764 3dcb64 vmlinux-baseline 3328268 529040 190556 4047864 3dc3f8 vmlinux From: Adrian Bunk UID16 was accidentially disabled for !EMBEDDED. Signed-off-by: Matt Mackall Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 3 --- arch/arm/Kconfig | 4 ---- arch/arm26/Kconfig | 4 ---- arch/cris/Kconfig | 4 ---- arch/h8300/Kconfig | 4 ---- arch/i386/Kconfig | 4 ---- arch/m68k/Kconfig | 4 ---- arch/m68knommu/Kconfig | 4 ---- arch/parisc/Kconfig | 3 --- arch/powerpc/Kconfig | 3 --- arch/ppc/Kconfig | 3 --- arch/sh/Kconfig | 4 ---- arch/sparc/Kconfig | 4 ---- arch/sparc64/Kconfig | 5 ----- arch/um/Kconfig | 4 ---- arch/v850/Kconfig | 3 --- arch/x86_64/Kconfig | 5 ----- init/Kconfig | 9 +++++++++ kernel/sys_ni.c | 19 +++++++++++++++++++ 19 files changed, 28 insertions(+), 65 deletions(-) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 153337ff1d..eedf41bf70 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -18,9 +18,6 @@ config MMU bool default y -config UID16 - bool - config RWSEM_GENERIC_SPINLOCK bool diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 16a5d522b2..7a74e3e5f9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -46,10 +46,6 @@ config MCA (and especially the web page given there) before attempting to build an MCA bus kernel. 
-config UID16 - bool - default y - config RWSEM_GENERIC_SPINLOCK bool default y diff --git a/arch/arm26/Kconfig b/arch/arm26/Kconfig index 1f00b3d03a..274e07019b 100644 --- a/arch/arm26/Kconfig +++ b/arch/arm26/Kconfig @@ -34,10 +34,6 @@ config FORCE_MAX_ZONEORDER int default 9 -config UID16 - bool - default y - config RWSEM_GENERIC_SPINLOCK bool default y diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index e5979d68e3..b832619497 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -9,10 +9,6 @@ config MMU bool default y -config UID16 - bool - default y - config RWSEM_GENERIC_SPINLOCK bool default y diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 26698a49f1..80940d712a 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -21,10 +21,6 @@ config FPU bool default n -config UID16 - bool - default y - config RWSEM_GENERIC_SPINLOCK bool default y diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 810ba8c37a..d849c6870e 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -29,10 +29,6 @@ config MMU config SBUS bool -config UID16 - bool - default y - config GENERIC_ISA_DMA bool default y diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 1dd5d18b22..96b9198280 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -10,10 +10,6 @@ config MMU bool default y -config UID16 - bool - default y - config RWSEM_GENERIC_SPINLOCK bool default y diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig index b96498120f..e2a6e86489 100644 --- a/arch/m68knommu/Kconfig +++ b/arch/m68knommu/Kconfig @@ -17,10 +17,6 @@ config FPU bool default n -config UID16 - bool - default y - config RWSEM_GENERIC_SPINLOCK bool default y diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 874a283edb..e77a06e962 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -19,9 +19,6 @@ config MMU config STACK_GROWSUP def_bool y -config UID16 - bool - config RWSEM_GENERIC_SPINLOCK def_bool y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index db93dbc0e2..331483ace0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -26,9 +26,6 @@ config MMU bool default y -config UID16 - bool - config GENERIC_HARDIRQS bool default y diff --git a/arch/ppc/Kconfig b/arch/ppc/Kconfig index cc3f64c084..e396f4591d 100644 --- a/arch/ppc/Kconfig +++ b/arch/ppc/Kconfig @@ -8,9 +8,6 @@ config MMU bool default y -config UID16 - bool - config GENERIC_HARDIRQS bool default y diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 64f5ae0ff9..8cf6d437a6 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -14,10 +14,6 @@ config SUPERH gaming console. The SuperH port has a home page at . 
-config UID16 - bool - default y - config RWSEM_GENERIC_SPINLOCK bool default y diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 56c34e7fd4..f944b58cdf 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -9,10 +9,6 @@ config MMU bool default y -config UID16 - bool - default y - config HIGHMEM bool default y diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index c4b7ad70cd..b775ceb4cf 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -309,11 +309,6 @@ config COMPAT depends on SPARC32_COMPAT default y -config UID16 - bool - depends on SPARC32_COMPAT - default y - config BINFMT_ELF32 tristate "Kernel support for 32-bit ELF binaries" depends on SPARC32_COMPAT diff --git a/arch/um/Kconfig b/arch/um/Kconfig index cdaa2ab192..b4ff2e5760 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -22,10 +22,6 @@ config SBUS config PCI bool -config UID16 - bool - default y - config GENERIC_CALIBRATE_DELAY bool default y diff --git a/arch/v850/Kconfig b/arch/v850/Kconfig index 3108659032..04494638b9 100644 --- a/arch/v850/Kconfig +++ b/arch/v850/Kconfig @@ -10,9 +10,6 @@ mainmenu "uClinux/v850 (w/o MMU) Kernel Configuration" config MMU bool default n -config UID16 - bool - default n config RWSEM_GENERIC_SPINLOCK bool default y diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 6ece645e4d..4f3e925962 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -542,11 +542,6 @@ config SYSVIPC_COMPAT depends on COMPAT && SYSVIPC default y -config UID16 - bool - depends on IA32_EMULATION - default y - endmenu source "net/Kconfig" diff --git a/init/Kconfig b/init/Kconfig index 0eb65f2ad8..1a1f114a37 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -228,6 +228,15 @@ config CPUSETS source "usr/Kconfig" +config UID16 + bool "Enable 16-bit UID system calls" if EMBEDDED + depends !ALPHA && !PPC && !PPC64 && !PARISC && !V850 && !ARCH_S390X + depends !X86_64 || IA32_EMULATION + depends !SPARC64 || SPARC32_COMPAT + default y + help + This enables the legacy 16-bit UID syscall wrappers. 
+ config CC_OPTIMIZE_FOR_SIZE bool "Optimize for size (Look out for broken compilers!)" default y diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7a8bc7f60d..72cafc922d 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -83,6 +83,25 @@ cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); +cond_syscall(sys_chown16); +cond_syscall(sys_fchown16); +cond_syscall(sys_getegid16); +cond_syscall(sys_geteuid16); +cond_syscall(sys_getgid16); +cond_syscall(sys_getgroups16); +cond_syscall(sys_getresgid16); +cond_syscall(sys_getresuid16); +cond_syscall(sys_getuid16); +cond_syscall(sys_lchown16); +cond_syscall(sys_setfsgid16); +cond_syscall(sys_setfsuid16); +cond_syscall(sys_setgid16); +cond_syscall(sys_setgroups16); +cond_syscall(sys_setregid16); +cond_syscall(sys_setresgid16); +cond_syscall(sys_setresuid16); +cond_syscall(sys_setreuid16); +cond_syscall(sys_setuid16); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.2.2 From 708e9a794cf8822b760edaccd9053edb07c34d19 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:25 -0800 Subject: [PATCH] tiny: Configure ELF core dump support configurable support for ELF core dumps text data bss dec hex filename 3330172 529036 190556 4049764 3dcb64 vmlinux-baseline 3325552 528912 190556 4045020 3db8dc vmlinux-no-elf add/remove: 0/8 grow/shrink: 0/0 up/down: 0/-4424 (-4424) function old new delta fill_note 32 - -32 maydump 58 - -58 dump_seek 67 - -67 writenote 180 - -180 elf_dump_thread_status 274 - -274 fill_psinfo 308 - -308 fill_prstatus 466 - -466 elf_core_dump 3039 - -3039 Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 4 ++-- init/Kconfig | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 288386b1de..80ca932ba0 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -58,7 +58,7 @@ extern int dump_fpu (struct pt_regs *, elf_fpregset_t *); * If we don't support core dumping, then supply a NULL so we * don't even try. */ -#ifdef USE_ELF_CORE_DUMP +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file); #else #define elf_core_dump NULL @@ -1113,7 +1113,7 @@ out: * Note that some platforms still use traditional core dumps and not * the ELF core dump. Each platform can select it as appropriate. */ -#ifdef USE_ELF_CORE_DUMP +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) /* * ELF core dumper diff --git a/init/Kconfig b/init/Kconfig index 1a1f114a37..9ac522a401 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -327,6 +327,12 @@ config DOUBLEFAULT would otherwise cause a system to silently reboot. Disabling this option saves about 4k. +config ELF_CORE + default y + bool "Enable ELF core dumps" if EMBEDDED + help + Enable support for generating core dumps. Disabling saves about 4k. + config BASE_FULL default y bool "Enable full-sized data structures for core" if EMBEDDED -- cgit v1.2.2 From 64ca9004b819ab87648dbfc78f3ef49ee491343e Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:26 -0800 Subject: [PATCH] Make vm86 support optional This adds an option to remove vm86 support under CONFIG_EMBEDDED. Saves about 5k. This version eliminates most of the #ifdefs of the previous version and instead uses function stubs in vm86.h. 
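A generic sketch of that stubs-in-the-header idea, using a hypothetical CONFIG_FOO and made-up function names, and using static inline stubs throughout rather than the mix of macros and an inline stub in the actual patch:

/* foo.h: callers stay free of #ifdefs; when the feature is compiled out,
 * the compiler sees trivial inline stubs instead of the real functions. */
#ifndef FOO_H
#define FOO_H

struct task;

#ifdef CONFIG_FOO
void foo_handle_fault(struct task *t, long error_code);
int  foo_handle_trap(struct task *t, long error_code, int trapno);
void foo_release_irqs(struct task *t);
#else
/* Feature compiled out: callers still build and link unchanged. */
static inline void foo_handle_fault(struct task *t, long error_code) { }
static inline void foo_release_irqs(struct task *t) { }
static inline int foo_handle_trap(struct task *t, long error_code, int trapno)
{
        return 0;       /* not handled: fall back to the generic trap path */
}
#endif /* CONFIG_FOO */

#endif /* FOO_H */

The companion piece is the Makefile side, where the object file is only built when the option is set (obj-$(CONFIG_FOO) += foo.o), exactly as this patch does for vm86.o.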
Also, release_vm86_irqs is moved from asm-i386/irq.h to a more appropriate home in vm86.h so that the stubs can live together. $ size vmlinux-baseline vmlinux-novm86 text data bss dec hex filename 2920821 523232 190652 3634705 377611 vmlinux-baseline 2916268 523100 190492 3629860 376324 vmlinux-novm86 Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/Makefile | 3 ++- arch/i386/kernel/entry.S | 2 ++ arch/i386/kernel/process.c | 1 + include/asm-i386/irq.h | 2 -- include/asm-i386/vm86.h | 20 ++++++++++++++++++++ init/Kconfig | 10 ++++++++++ kernel/sys_ni.c | 2 ++ 7 files changed, 37 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 4f40589e17..be1880bb75 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -4,7 +4,7 @@ extra-y := head.o init_task.o vmlinux.lds -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ +obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ quirks.o i8237.o @@ -34,6 +34,7 @@ obj-$(CONFIG_ACPI_SRAT) += srat.o obj-$(CONFIG_HPET_TIMER) += time_hpet.o obj-$(CONFIG_EFI) += efi.o efi_stub.o obj-$(CONFIG_DOUBLEFAULT) += doublefault.o +obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o EXTRA_AFLAGS := -traditional diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 607c060075..4d704724b2 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -323,6 +323,7 @@ work_notifysig: # deal with pending signals and ALIGN work_notifysig_v86: +#ifdef CONFIG_VM86 pushl %ecx # save ti_flags for do_notify_resume call save_v86_state # %eax contains pt_regs pointer popl %ecx @@ -330,6 +331,7 @@ work_notifysig_v86: xorl %edx, %edx call do_notify_resume jmp resume_userspace +#endif # perform syscall exit tracing ALIGN diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 45e7f0ac4b..035928f3f6 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -48,6 +48,7 @@ #include #include #include +#include #ifdef CONFIG_MATH_EMULATION #include #endif diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h index 270f1986b1..5169d7af45 100644 --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -21,8 +21,6 @@ static __inline__ int irq_canonicalize(int irq) return ((irq == 2) ? 
9 : irq); } -extern void release_vm86_irqs(struct task_struct *); - #ifdef CONFIG_X86_LOCAL_APIC # define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */ #endif diff --git a/include/asm-i386/vm86.h b/include/asm-i386/vm86.h index 40ec82c691..952fd69573 100644 --- a/include/asm-i386/vm86.h +++ b/include/asm-i386/vm86.h @@ -16,7 +16,11 @@ #define IF_MASK 0x00000200 #define IOPL_MASK 0x00003000 #define NT_MASK 0x00004000 +#ifdef CONFIG_VM86 #define VM_MASK 0x00020000 +#else +#define VM_MASK 0 /* ignored */ +#endif #define AC_MASK 0x00040000 #define VIF_MASK 0x00080000 /* virtual interrupt flag */ #define VIP_MASK 0x00100000 /* virtual interrupt pending */ @@ -200,9 +204,25 @@ struct kernel_vm86_struct { */ }; +#ifdef CONFIG_VM86 + void handle_vm86_fault(struct kernel_vm86_regs *, long); int handle_vm86_trap(struct kernel_vm86_regs *, long, int); +struct task_struct; +void release_vm86_irqs(struct task_struct *); + +#else + +#define handle_vm86_fault(a, b) +#define release_vm86_irqs(a) + +static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c) { + return 0; +} + +#endif /* CONFIG_VM86 */ + #endif /* __KERNEL__ */ #endif diff --git a/init/Kconfig b/init/Kconfig index 9ac522a401..f8f6929d8f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -237,6 +237,16 @@ config UID16 help This enables the legacy 16-bit UID syscall wrappers. +config VM86 + depends X86 + default y + bool "Enable VM86 support" if EMBEDDED + help + This option is required by programs like DOSEMU to run 16-bit legacy + code on X86 processors. It also may be needed by software like + XFree86 to initialize some video cards via BIOS. Disabling this + option saves about 6k. + config CC_OPTIMIZE_FOR_SIZE bool "Optimize for size (Look out for broken compilers!)" default y diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 72cafc922d..bd3b9bfcfc 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -102,6 +102,8 @@ cond_syscall(sys_setresgid16); cond_syscall(sys_setresuid16); cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); +cond_syscall(sys_vm86old); +cond_syscall(sys_vm86); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.2.2