author     Jens Axboe <jens.axboe@oracle.com>   2007-08-07 03:02:51 -0400
committer  Jens Axboe <jens.axboe@oracle.com>   2007-10-16 05:12:53 -0400
commit     a8474ce23a73185dd2bae4c884b1716474032d31
tree       63501846b8aaef02579a868f6d5118b6a07c4a5e /drivers/scsi/scsi_lib.c
parent     0cde8d9510e242c73b2d68f9949cd3c456c863b4
SCSI: support for allocating large scatterlists
This is what enables large commands. If we need to allocate an
sgtable that doesn't fit in a single page, allocate several
SCSI_MAX_SG_SEGMENTS-sized tables and chain them together.
SCSI defaults to large chained sg tables, if the arch supports it.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
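
The core idea in the message above is easy to model outside the kernel: when a
request needs more segments than one pool-sized table holds, every table but
the last gives up its final slot to hold a link to the next table. Below is a
minimal, self-contained C sketch of just that split arithmetic; note that
struct sg_table_piece, alloc_chained() and the chain field are hypothetical
stand-ins for struct scatterlist, scsi_alloc_sgtable() and sg_chain(), not
kernel API.

/* Standalone model of the chained-sgtable split. Compile as plain C. */
#include <stdio.h>
#include <stdlib.h>

#define SCSI_MAX_SG_SEGMENTS 128        /* one pool-sized table, as in the patch */

struct sg_table_piece {
        int nents;                      /* usable entries in this piece */
        struct sg_table_piece *chain;   /* models the chain entry in the last slot */
};

/* Split 'left' segments into pool-sized pieces; every piece except the
 * last reserves one slot (SCSI_MAX_SG_SEGMENTS - 1 usable) for the link
 * to the next piece, mirroring the allocation loop in the patch. */
static struct sg_table_piece *alloc_chained(int left)
{
        struct sg_table_piece *head = NULL, *prev = NULL;

        while (left > 0) {
                int this = left;
                struct sg_table_piece *p;

                if (this > SCSI_MAX_SG_SEGMENTS)
                        this = SCSI_MAX_SG_SEGMENTS - 1;  /* reserve chain slot */

                p = calloc(1, sizeof(*p));
                if (!p)
                        return NULL;    /* the real code unwinds the partial chain */
                p->nents = this;

                if (!head)
                        head = p;
                if (prev)
                        prev->chain = p;        /* models sg_chain(prev, MAX, p) */

                prev = p;
                left -= this;
        }
        return head;
}

int main(void)
{
        int pieces = 0;
        struct sg_table_piece *p;

        for (p = alloc_chained(2048); p; p = p->chain)
                pieces++;

        /* sixteen 127-entry pieces plus a 16-entry tail = 17 */
        printf("pieces: %d\n", pieces);
        return 0;
}

With SCSI_MAX_SG_SEGMENTS at 128, a 2048-segment request splits into sixteen
127-entry pieces plus a 16-entry tail; at 4 KB per segment that is the "at
least 8mb ios" the new comment in the patch refers to.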
Diffstat (limited to 'drivers/scsi/scsi_lib.c')
 -rw-r--r--  drivers/scsi/scsi_lib.c | 209
 1 file changed, 162 insertions(+), 47 deletions(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7e1dcc57f115..c75cb6ad6d94 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -36,33 +36,19 @@
 
 struct scsi_host_sg_pool {
         size_t size;
         char *name;
         struct kmem_cache *slab;
         mempool_t *pool;
 };
 
-#if (SCSI_MAX_PHYS_SEGMENTS < 32)
-#error SCSI_MAX_PHYS_SEGMENTS is too small
-#endif
-
 #define SP(x) { x, "sgpool-" #x }
 static struct scsi_host_sg_pool scsi_sg_pools[] = {
         SP(8),
         SP(16),
         SP(32),
-#if (SCSI_MAX_PHYS_SEGMENTS > 32)
         SP(64),
-#if (SCSI_MAX_PHYS_SEGMENTS > 64)
         SP(128),
-#if (SCSI_MAX_PHYS_SEGMENTS > 128)
-        SP(256),
-#if (SCSI_MAX_PHYS_SEGMENTS > 256)
-#error SCSI_MAX_PHYS_SEGMENTS is too large
-#endif
-#endif
-#endif
-#endif
-};
+};
 #undef SP
 
 static void scsi_run_queue(struct request_queue *q);
@@ -698,45 +684,126 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int uptodate,
         return NULL;
 }
 
-struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
-{
-        struct scsi_host_sg_pool *sgp;
-        struct scatterlist *sgl;
+/*
+ * The maximum number of SG segments that we will put inside a scatterlist
+ * (unless chaining is used). Should ideally fit inside a single page, to
+ * avoid a higher order allocation.
+ */
+#define SCSI_MAX_SG_SEGMENTS        128
 
-        BUG_ON(!cmd->use_sg);
+/*
+ * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit
+ * is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
+ */
+#define SCSI_MAX_SG_CHAIN_SEGMENTS  2048
 
-        switch (cmd->use_sg) {
+static inline unsigned int scsi_sgtable_index(unsigned short nents)
+{
+        unsigned int index;
+
+        switch (nents) {
         case 1 ... 8:
-                cmd->sglist_len = 0;
+                index = 0;
                 break;
         case 9 ... 16:
-                cmd->sglist_len = 1;
+                index = 1;
                 break;
         case 17 ... 32:
-                cmd->sglist_len = 2;
+                index = 2;
                 break;
-#if (SCSI_MAX_PHYS_SEGMENTS > 32)
         case 33 ... 64:
-                cmd->sglist_len = 3;
+                index = 3;
                 break;
-#if (SCSI_MAX_PHYS_SEGMENTS > 64)
-        case 65 ... 128:
-                cmd->sglist_len = 4;
+        case 65 ... SCSI_MAX_SG_SEGMENTS:
+                index = 4;
                 break;
-#if (SCSI_MAX_PHYS_SEGMENTS > 128)
-        case 129 ... 256:
-                cmd->sglist_len = 5;
-                break;
-#endif
-#endif
-#endif
         default:
-                return NULL;
+                printk(KERN_ERR "scsi: bad segment count=%d\n", nents);
+                BUG();
         }
 
-        sgp = scsi_sg_pools + cmd->sglist_len;
-        sgl = mempool_alloc(sgp->pool, gfp_mask);
-        return sgl;
+        return index;
+}
+
+struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
+{
+        struct scsi_host_sg_pool *sgp;
+        struct scatterlist *sgl, *prev, *ret;
+        unsigned int index;
+        int this, left;
+
+        BUG_ON(!cmd->use_sg);
+
+        left = cmd->use_sg;
+        ret = prev = NULL;
+        do {
+                this = left;
+                if (this > SCSI_MAX_SG_SEGMENTS) {
+                        this = SCSI_MAX_SG_SEGMENTS - 1;
+                        index = SG_MEMPOOL_NR - 1;
+                } else
+                        index = scsi_sgtable_index(this);
+
+                left -= this;
+
+                sgp = scsi_sg_pools + index;
+
+                sgl = mempool_alloc(sgp->pool, gfp_mask);
+                if (unlikely(!sgl))
+                        goto enomem;
+
+                memset(sgl, 0, sizeof(*sgl) * sgp->size);
+
+                /*
+                 * first loop through, set initial index and return value
+                 */
+                if (!ret) {
+                        cmd->sglist_len = index;
+                        ret = sgl;
+                }
+
+                /*
+                 * chain previous sglist, if any. we know the previous
+                 * sglist must be the biggest one, or we would not have
+                 * ended up doing another loop.
+                 */
+                if (prev)
+                        sg_chain(prev, SCSI_MAX_SG_SEGMENTS, sgl);
+
+                /*
+                 * don't allow subsequent mempool allocs to sleep, it would
+                 * violate the mempool principle.
+                 */
+                gfp_mask &= ~__GFP_WAIT;
+                gfp_mask |= __GFP_HIGH;
+                prev = sgl;
+        } while (left);
+
+        /*
+         * ->use_sg may get modified after dma mapping has potentially
+         * shrunk the number of segments, so keep a copy of it for free.
+         */
+        cmd->__use_sg = cmd->use_sg;
+        return ret;
+enomem:
+        if (ret) {
+                /*
+                 * Free entries chained off ret. Since we were trying to
+                 * allocate another sglist, we know that all entries are of
+                 * the max size.
+                 */
+                sgp = scsi_sg_pools + SG_MEMPOOL_NR - 1;
+                prev = ret;
+                ret = &ret[SCSI_MAX_SG_SEGMENTS - 1];
+
+                while ((sgl = sg_chain_ptr(ret)) != NULL) {
+                        ret = &sgl[SCSI_MAX_SG_SEGMENTS - 1];
+                        mempool_free(sgl, sgp->pool);
+                }
+
+                mempool_free(prev, sgp->pool);
+        }
+        return NULL;
 }
 
 EXPORT_SYMBOL(scsi_alloc_sgtable);
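
The sg_chain()/sg_chain_ptr() pair used above encodes the link as a tagged
pointer: as implemented around this time, the final slot's page pointer holds
the address of the next scatterlist with bit 0 set, so a walker can tell a
link from a data entry. Below is a user-space model of that tagging trick;
fake_sg, chain(), is_chain() and chain_ptr() are hypothetical names standing
in for the real structures and helpers.

/* Minimal model of low-bit pointer tagging for chain entries. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_sg {
        void *page;             /* stands in for struct page *page */
        unsigned int length;
};

static void chain(struct fake_sg *prv, unsigned int prv_nents,
                  struct fake_sg *next)
{
        /* tag bit 0; safe because allocations are at least pointer-aligned,
         * so bit 0 of a genuine pointer is always zero */
        prv[prv_nents - 1].page = (void *)((uintptr_t)next | 0x01);
}

static int is_chain(struct fake_sg *sg)
{
        return (uintptr_t)sg->page & 0x01;
}

static struct fake_sg *chain_ptr(struct fake_sg *sg)
{
        return (struct fake_sg *)((uintptr_t)sg->page & ~(uintptr_t)0x01);
}

int main(void)
{
        struct fake_sg *a = calloc(4, sizeof(*a));
        struct fake_sg *b = calloc(4, sizeof(*b));

        chain(a, 4, b);                 /* last entry of a points to b */
        assert(is_chain(&a[3]));
        assert(chain_ptr(&a[3]) == b);  /* a walker follows the link */
        puts("chain link resolves");
        free(a);
        free(b);
        return 0;
}

Reserving one slot per table for this link is why "this = SCSI_MAX_SG_SEGMENTS
- 1" appears in both the allocation loop above and the free loop below.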
@@ -748,6 +815,42 @@ void scsi_free_sgtable(struct scsi_cmnd *cmd)
 
         BUG_ON(cmd->sglist_len >= SG_MEMPOOL_NR);
 
+        /*
+         * if this is the biggest size sglist, check if we have
+         * chained parts we need to free
+         */
+        if (cmd->__use_sg > SCSI_MAX_SG_SEGMENTS) {
+                unsigned short this, left;
+                struct scatterlist *next;
+                unsigned int index;
+
+                left = cmd->__use_sg - (SCSI_MAX_SG_SEGMENTS - 1);
+                next = sg_chain_ptr(&sgl[SCSI_MAX_SG_SEGMENTS - 1]);
+                while (left && next) {
+                        sgl = next;
+                        this = left;
+                        if (this > SCSI_MAX_SG_SEGMENTS) {
+                                this = SCSI_MAX_SG_SEGMENTS - 1;
+                                index = SG_MEMPOOL_NR - 1;
+                        } else
+                                index = scsi_sgtable_index(this);
+
+                        left -= this;
+
+                        sgp = scsi_sg_pools + index;
+
+                        if (left)
+                                next = sg_chain_ptr(&sgl[sgp->size - 1]);
+
+                        mempool_free(sgl, sgp->pool);
+                }
+
+                /*
+                 * Restore original, will be freed below
+                 */
+                sgl = cmd->request_buffer;
+        }
+
         sgp = scsi_sg_pools + cmd->sglist_len;
         mempool_free(sgl, sgp->pool);
 }
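
One ordering subtlety in the free path above is worth isolating: the link to
the next piece lives inside the piece being freed, which is why the loop reads
sg_chain_ptr() into next before calling mempool_free(). A companion sketch to
the allocation model under the commit message, using the same hypothetical
struct sg_table_piece:

#include <stdlib.h>

struct sg_table_piece {
        int nents;
        struct sg_table_piece *chain;
};

static void free_chained(struct sg_table_piece *head)
{
        while (head) {
                /* save the link first: it lives in the block we are about
                 * to free, exactly as in scsi_free_sgtable() */
                struct sg_table_piece *next = head->chain;

                free(head);     /* models mempool_free(sgl, sgp->pool) */
                head = next;
        }
}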
@@ -988,7 +1091,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 static int scsi_init_io(struct scsi_cmnd *cmd)
 {
         struct request *req = cmd->request;
-        struct scatterlist *sgpnt;
         int count;
 
         /*
@@ -1001,14 +1103,13 @@ static int scsi_init_io(struct scsi_cmnd *cmd)
         /*
          * If sg table allocation fails, requeue request later.
          */
-        sgpnt = scsi_alloc_sgtable(cmd, GFP_ATOMIC);
-        if (unlikely(!sgpnt)) {
+        cmd->request_buffer = scsi_alloc_sgtable(cmd, GFP_ATOMIC);
+        if (unlikely(!cmd->request_buffer)) {
                 scsi_unprep_request(req);
                 return BLKPREP_DEFER;
         }
 
         req->buffer = NULL;
-        cmd->request_buffer = (char *) sgpnt;
         if (blk_pc_request(req))
                 cmd->request_bufflen = req->data_len;
         else
@@ -1533,8 +1634,22 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
         if (!q)
                 return NULL;
 
+        /*
+         * this limit is imposed by hardware restrictions
+         */
         blk_queue_max_hw_segments(q, shost->sg_tablesize);
-        blk_queue_max_phys_segments(q, SCSI_MAX_PHYS_SEGMENTS);
+
+        /*
+         * In the future, sg chaining support will be mandatory and this
+         * ifdef can then go away. Right now we don't have all archs
+         * converted, so better keep it safe.
+         */
+#ifdef ARCH_HAS_SG_CHAIN
+        blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
+#else
+        blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
+#endif
+
         blk_queue_max_sectors(q, shost->max_sectors);
         blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
         blk_queue_segment_boundary(q, shost->dma_boundary);