aboutsummaryrefslogtreecommitdiffstats
path: root/arch/sparc/kernel/psycho_common.c
blob: fe2af66bb1988ee3eff896223b5f131e06ca06e6 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
/* psycho_common.c: Code common to PSYCHO and derivative PCI controllers.
 *
 * Copyright (C) 2008 David S. Miller <davem@davemloft.net>
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>

#include <asm/upa.h>

#include "pci_impl.h"
#include "iommu_common.h"
#include "psycho_common.h"

#define  PSYCHO_STRBUF_CTRL_DENAB	0x0000000000000002ULL
#define  PSYCHO_STCERR_WRITE		0x0000000000000002ULL
#define  PSYCHO_STCERR_READ		0x0000000000000001ULL
#define  PSYCHO_STCTAG_PPN		0x0fffffff00000000ULL
#define  PSYCHO_STCTAG_VPN		0x00000000ffffe000ULL
#define  PSYCHO_STCTAG_VALID		0x0000000000000002ULL
#define  PSYCHO_STCTAG_WRITE		0x0000000000000001ULL
#define  PSYCHO_STCLINE_LINDX		0x0000000001e00000ULL
#define  PSYCHO_STCLINE_SPTR		0x00000000001f8000ULL
#define  PSYCHO_STCLINE_LADDR		0x0000000000007f00ULL
#define  PSYCHO_STCLINE_EPTR		0x00000000000000fcULL
#define  PSYCHO_STCLINE_VALID		0x0000000000000002ULL
#define  PSYCHO_STCLINE_FOFN		0x0000000000000001ULL

static DEFINE_SPINLOCK(stc_buf_lock);
static unsigned long stc_error_buf[128];
static unsigned long stc_tag_buf[16];
static unsigned long stc_line_buf[16];

static void psycho_check_stc_error(struct pci_pbm_info *pbm)
{
	unsigned long err_base, tag_base, line_base;
	struct strbuf *strbuf = &pbm->stc;
	u64 control;
	int i;

	if (!strbuf->strbuf_control)
		return;

	err_base = strbuf->strbuf_err_stat;
	tag_base = strbuf->strbuf_tag_diag;
	line_base = strbuf->strbuf_line_diag;

	spin_lock(&stc_buf_lock);

	/* This is __REALLY__ dangerous.  When we put the streaming
	 * buffer into diagnostic mode to probe it's tags and error
	 * status, we _must_ clear all of the line tag valid bits
	 * before re-enabling the streaming buffer.  If any dirty data
	 * lives in the STC when we do this, we will end up
	 * invalidating it before it has a chance to reach main
	 * memory.
	 */
	control = upa_readq(strbuf->strbuf_control);
	upa_writeq(control | PSYCHO_STRBUF_CTRL_DENAB, strbuf->strbuf_control);
	for (i = 0; i < 128; i++) {
		u64 val;

		val = upa_readq(err_base + (i * 8UL));
		upa_writeq(0UL, err_base + (i * 8UL));
		stc_error_buf[i] = val;
	}
	for (i = 0; i < 16; i++) {
		stc_tag_buf[i] = upa_readq(tag_base + (i * 8UL));
		stc_line_buf[i] = upa_readq(line_base + (i * 8UL));
		upa_writeq(0UL, tag_base + (i * 8UL));
		upa_writeq(0UL, line_base + (i * 8UL));
	}

	/* OK, state is logged, exit diagnostic mode. */
	upa_writeq(control, strbuf->strbuf_control);

	for (i = 0; i < 16; i++) {
		int j, saw_error, first, last;

		saw_error = 0;
		first = i * 8;
		last = first + 8;
		for (j = first; j < last; j++) {
			u64 errval = stc_error_buf[j];
			if (errval != 0) {
				saw_error++;
				printk(KERN_ERR "%s: STC_ERR(%d)[wr(%d)"
				       "rd(%d)]\n",
				       pbm->name,
				       j,
				       (errval & PSYCHO_STCERR_WRITE) ? 1 : 0,
				       (errval & PSYCHO_STCERR_READ) ? 1 : 0);
			}
		}
		if (saw_error != 0) {
			u64 tagval = stc_tag_buf[i];
			u64 lineval = stc_line_buf[i];
			printk(KERN_ERR "%s: STC_TAG(%d)[PA(%016llx)VA(%08llx)"
			       "V(%d)W(%d)]\n",
			       pbm->name,
			       i,
			       ((tagval & PSYCHO_STCTAG_PPN) >> 19UL),
			       (tagval & PSYCHO_STCTAG_VPN),
			       ((tagval & PSYCHO_STCTAG_VALID) ? 1 : 0),
			       ((tagval & PSYCHO_STCTAG_WRITE) ? 1 : 0));
			printk(KERN_ERR "%s: STC_LINE(%d)[LIDX(%llx)SP(%llx)"
			       "LADDR(%llx)EP(%llx)V(%d)FOFN(%d)]\n",
			       pbm->name,
			       i,
			       ((lineval & PSYCHO_STCLINE_LINDX) >> 21UL),
			       ((lineval & PSYCHO_STCLINE_SPTR) >> 15UL),
			       ((lineval & PSYCHO_STCLINE_LADDR) >> 8UL),
			       ((lineval & PSYCHO_STCLINE_EPTR) >> 2UL),
			       ((lineval & PSYCHO_STCLINE_VALID) ? 1 : 0),
			       ((lineval & PSYCHO_STCLINE_FOFN) ? 1 : 0));
		}
	}

	spin_unlock(&stc_buf_lock);
}

#define PSYCHO_IOMMU_TAG		0xa580UL
#define PSYCHO_IOMMU_DATA		0xa600UL

static void psycho_record_iommu_tags_and_data(struct pci_pbm_info *pbm,
					      u64 *tag, u64 *data)
{
	int i;

	for (i = 0; i < 16; i++) {
		unsigned long base = pbm->controller_regs;
		unsigned long off = i * 8UL;

		tag[i] = upa_readq(base + PSYCHO_IOMMU_TAG+off);
		data[i] = upa_readq(base + PSYCHO_IOMMU_DATA+off);

		/* Now clear out the entry. */
		upa_writeq(0, base + PSYCHO_IOMMU_TAG + off);
		upa_writeq(0, base + PSYCHO_IOMMU_DATA + off);
	}
}

#define  PSYCHO_IOMMU_TAG_ERRSTS (0x3UL << 23UL)
#define  PSYCHO_IOMMU_TAG_ERR	 (0x1UL << 22UL)
#define  PSYCHO_IOMMU_TAG_WRITE	 (0x1UL << 21UL)
#define  PSYCHO_IOMMU_TAG_STREAM (0x1UL << 20UL)
#define  PSYCHO_IOMMU_TAG_SIZE	 (0x1UL << 19UL)
#define  PSYCHO_IOMMU_TAG_VPAGE	 0x7ffffULL
#define  PSYCHO_IOMMU_DATA_VALID (1UL << 30UL)
#define  PSYCHO_IOMMU_DATA_CACHE (1UL << 28UL)
#define  PSYCHO_IOMMU_DATA_PPAGE 0xfffffffULL

static void psycho_dump_iommu_tags_and_data(struct pci_pbm_info *pbm,
					    u64 *tag, u64 *data)
{
	int i;

	for (i = 0; i < 16; i++) {
		u64 tag_val, data_val;
		const char *type_str;
		tag_val = tag[i];
		if (!(tag_val & PSYCHO_IOMMU_TAG_ERR))
			continue;

		data_val = data[i];
		switch((tag_val & PSYCHO_IOMMU_TAG_ERRSTS) >> 23UL) {
		case 0:
			type_str = "Protection Error";
			break;
		case 1:
			type_str = "Invalid Error";
			break;
		case 2:
			type_str = "TimeOut Error";
			break;
		case 3:
		default:
			type_str = "ECC Error";
			break;
		}

		printk(KERN_ERR "%s: IOMMU TAG(%d)[error(%s) wr(%d) "
		       "str(%d) sz(%dK) vpg(%08llx)]\n",
		       pbm->name, i, type_str,
		       ((tag_val & PSYCHO_IOMMU_TAG_WRITE) ? 1 : 0),
		       ((tag_val & PSYCHO_IOMMU_TAG_STREAM) ? 1 : 0),
		       ((tag_val & PSYCHO_IOMMU_TAG_SIZE) ? 64 : 8),
		       (tag_val & PSYCHO_IOMMU_TAG_VPAGE) << IOMMU_PAGE_SHIFT);
		printk(KERN_ERR "%s: IOMMU DATA(%d)[valid(%d) cache(%d) "
		       "ppg(%016llx)]\n",
		       pbm->name, i,
		       ((data_val & PSYCHO_IOMMU_DATA_VALID) ? 1 : 0),
		       ((data_val & PSYCHO_IOMMU_DATA_CACHE) ? 1 : 0),
		       (data_val & PSYCHO_IOMMU_DATA_PPAGE) << IOMMU_PAGE_SHIFT);
	}
}

#define  PSYCHO_IOMMU_CTRL_XLTESTAT	0x0000000006000000UL
#define  PSYCHO_IOMMU_CTRL_XLTEERR	0x0000000001000000UL

void psycho_check_iommu_error(struct pci_pbm_info *pbm,
			      unsigned long afsr,
			      unsigned long afar,
			      enum psycho_error_type type)
{
	u64 control, iommu_tag[16], iommu_data[16];
	struct iommu *iommu = pbm->iommu;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	control = upa_readq(iommu->iommu_control);
	if (control & PSYCHO_IOMMU_CTRL_XLTEERR) {
		const char *type_str;

		control &= ~PSYCHO_IOMMU_CTRL_XLTEERR;
		upa_writeq(control, iommu->iommu_control);

		switch ((control & PSYCHO_IOMMU_CTRL_XLTESTAT) >> 25UL) {
		case 0:
			type_str = "Protection Error";
			break;
		case 1:
			type_str = "Invalid Error";
			break;
		case 2:
			type_str = "TimeOut Error";
			break;
		case 3:
		default:
			type_str = "ECC Error";
			break;
		};
		printk(KERN_ERR "%s: IOMMU Error, type[%s]\n",
		       pbm->name, type_str);

		/* It is very possible for another DVMA to occur while
		 * we do this probe, and corrupt the system further.
		 * But we are so screwed at this point that we are
		 * likely to crash hard anyways, so get as much
		 * diagnostic information to the console as we can.
		 */
		psycho_record_iommu_tags_and_data(pbm, iommu_tag, iommu_data);
		psycho_dump_iommu_tags_and_data(pbm, iommu_tag, iommu_data);
	}
	psycho_check_stc_error(pbm);
	spin_unlock_irqrestore(&iommu->lock, flags);
}

#define  PSYCHO_PCICTRL_SBH_ERR	 0x0000000800000000UL
#define  PSYCHO_PCICTRL_SERR	 0x0000000400000000UL

static irqreturn_t psycho_pcierr_intr_other(struct pci_pbm_info *pbm)
{
	irqreturn_t ret = IRQ_NONE;
	u64 csr, csr_error_bits;
	u16 stat, *addr;

	csr = upa_readq(pbm->pci_csr);
	csr_error_bits = csr & (PSYCHO_PCICTRL_SBH_ERR | PSYCHO_PCICTRL_SERR);
	if (csr_error_bits) {
		/* Clear the errors.  */
		upa_writeq(csr, pbm->pci_csr);

		/* Log 'em.  */
		if (csr_error_bits & PSYCHO_PCICTRL_SBH_ERR)
			printk(KERN_ERR "%s: PCI streaming byte hole "
			       "error asserted.\n", pbm->name);
		if (csr_error_bits & PSYCHO_PCICTRL_SERR)
			printk(KERN_ERR "%s: PCI SERR signal asserted.\n",
			       pbm->name);
		ret = IRQ_HANDLED;
	}
	addr = psycho_pci_config_mkaddr(pbm, pbm->pci_first_busno,
					0, PCI_STATUS);
	pci_config_read16(addr, &stat);
	if (stat & (PCI_STATUS_PARITY |
		    PCI_STATUS_SIG_TARGET_ABORT |
		    PCI_STATUS_REC_TARGET_ABORT |
		    PCI_STATUS_REC_MASTER_ABORT |
		    PCI_STATUS_SIG_SYSTEM_ERROR)) {
		printk(KERN_ERR "%s: PCI bus error, PCI_STATUS[%04x]\n",
		       pbm->name, stat);
		pci_config_write16(addr, 0xffff);
		ret = IRQ_HANDLED;
	}
	return ret;
}

#define  PSYCHO_PCIAFSR_PMA	0x8000000000000000ULL
#define  PSYCHO_PCIAFSR_PTA	0x4000000000000000ULL
#define  PSYCHO_PCIAFSR_PRTRY	0x2000000000000000ULL
#define  PSYCHO_PCIAFSR_PPERR	0x1000000000000000ULL
#define  PSYCHO_PCIAFSR_SMA	0x0800000000000000ULL
#define  PSYCHO_PCIAFSR_STA	0x0400000000000000ULL
#define  PSYCHO_PCIAFSR_SRTRY	0x0200000000000000ULL
#define  PSYCHO_PCIAFSR_SPERR	0x0100000000000000ULL
#define  PSYCHO_PCIAFSR_RESV1	0x00ff000000000000ULL
#define  PSYCHO_PCIAFSR_BMSK	0x0000ffff00000000ULL
#define  PSYCHO_PCIAFSR_BLK	0x0000000080000000ULL
#define  PSYCHO_PCIAFSR_RESV2	0x0000000040000000ULL
#define  PSYCHO_PCIAFSR_MID	0x000000003e000000ULL
#define  PSYCHO_PCIAFSR_RESV3	0x0000000001ffffffULL

irqreturn_t psycho_pcierr_intr(int irq, void *dev_id)
{
	struct pci_pbm_info *pbm = dev_id;
	u64 afsr, afar, error_bits;
	int reported;

	afsr = upa_readq(pbm->pci_afsr);
	afar = upa_readq(pbm->pci_afar);
	error_bits = afsr &
		(PSYCHO_PCIAFSR_PMA | PSYCHO_PCIAFSR_PTA |
		 PSYCHO_PCIAFSR_PRTRY | PSYCHO_PCIAFSR_PPERR |
		 PSYCHO_PCIAFSR_SMA | PSYCHO_PCIAFSR_STA |
		 PSYCHO_PCIAFSR_SRTRY | PSYCHO_PCIAFSR_SPERR);
	if (!error_bits)
		return psycho_pcierr_intr_other(pbm);
	upa_writeq(error_bits, pbm->pci_afsr);
	printk(KERN_ERR "%s: PCI Error, primary error type[%s]\n",
	       pbm->name,
	       (((error_bits & PSYCHO_PCIAFSR_PMA) ?
		 "Master Abort" :
		 ((error_bits & PSYCHO_PCIAFSR_PTA) ?
		  "Target Abort" :
		  ((error_bits & PSYCHO_PCIAFSR_PRTRY) ?
		   "Excessive Retries" :
		   ((error_bits & PSYCHO_PCIAFSR_PPERR) ?
		    "Parity Error" : "???"))))));
	printk(KERN_ERR "%s: bytemask[%04llx] UPA_MID[%02llx] was_block(%d)\n",
	       pbm->name,
	       (afsr & PSYCHO_PCIAFSR_BMSK) >> 32UL,
	       (afsr & PSYCHO_PCIAFSR_MID) >> 25UL,
	       (afsr & PSYCHO_PCIAFSR_BLK) ? 1 : 0);
	printk(KERN_ERR "%s: PCI AFAR [%016llx]\n", pbm->name, afar);
	printk(KERN_ERR "%s: PCI Secondary errors [", pbm->name);
	reported = 0;
	if (afsr & PSYCHO_PCIAFSR_SMA) {
		reported++;
		printk("(Master Abort)");
	}
	if (afsr & PSYCHO_PCIAFSR_STA) {
		reported++;
		printk("(Target Abort)");
	}
	if (afsr & PSYCHO_PCIAFSR_SRTRY) {
		reported++;
		printk("(Excessive Retries)");
	}
	if (afsr & PSYCHO_PCIAFSR_SPERR) {
		reported++;
		printk("(Parity Error)");
	}
	if (!reported)
		printk("(none)");
	printk("]\n");

	if (error_bits & (PSYCHO_PCIAFSR_PTA | PSYCHO_PCIAFSR_STA)) {
		psycho_check_iommu_error(pbm, afsr, afar, PCI_ERR);
		pci_scan_for_target_abort(pbm, pbm->pci_bus);
	}
	if (error_bits & (PSYCHO_PCIAFSR_PMA | PSYCHO_PCIAFSR_SMA))
		pci_scan_for_master_abort(pbm, pbm->pci_bus);

	if (error_bits & (PSYCHO_PCIAFSR_PPERR | PSYCHO_PCIAFSR_SPERR))
		pci_scan_for_parity_error(pbm, pbm->pci_bus);

	return IRQ_HANDLED;
}

static void psycho_iommu_flush(struct pci_pbm_info *pbm)
{
	int i;

	for (i = 0; i < 16; i++) {
		unsigned long off = i * 8;

		upa_writeq(0, pbm->controller_regs + PSYCHO_IOMMU_TAG + off);
		upa_writeq(0, pbm->controller_regs + PSYCHO_IOMMU_DATA + off);
	}
}

#define PSYCHO_IOMMU_CONTROL		0x0200UL
#define  PSYCHO_IOMMU_CTRL_TSBSZ	0x0000000000070000UL
#define  PSYCHO_IOMMU_TSBSZ_1K      	0x0000000000000000UL
#define  PSYCHO_IOMMU_TSBSZ_2K      	0x0000000000010000UL
#define  PSYCHO_IOMMU_TSBSZ_4K      	0x0000000000020000UL
#define  PSYCHO_IOMMU_TSBSZ_8K      	0x0000000000030000UL
#define  PSYCHO_IOMMU_TSBSZ_16K     	0x0000000000040000UL
#define  PSYCHO_IOMMU_TSBSZ_32K     	0x0000000000050000UL
#define  PSYCHO_IOMMU_TSBSZ_64K     	0x0000000000060000UL
#define  PSYCHO_IOMMU_TSBSZ_128K    	0x0000000000070000UL
#define  PSYCHO_IOMMU_CTRL_TBWSZ    	0x0000000000000004UL
#define  PSYCHO_IOMMU_CTRL_DENAB    	0x0000000000000002UL
#define  PSYCHO_IOMMU_CTRL_ENAB     	0x0000000000000001UL
#define PSYCHO_IOMMU_FLUSH		0x0210UL
#define PSYCHO_IOMMU_TSBBASE		0x0208UL

int psycho_iommu_init(struct pci_pbm_info *pbm, int tsbsize,
		      u32 dvma_offset, u32 dma_mask,
		      unsigned long write_complete_offset)
{
	struct iommu *iommu = pbm->iommu;
	u64 control;
	int err;

	iommu->iommu_control  = pbm->controller_regs + PSYCHO_IOMMU_CONTROL;
	iommu->iommu_tsbbase  = pbm->controller_regs + PSYCHO_IOMMU_TSBBASE;
	iommu->iommu_flush    = pbm->controller_regs + PSYCHO_IOMMU_FLUSH;
	iommu->iommu_tags     = pbm->controller_regs + PSYCHO_IOMMU_TAG;
	iommu->write_complete_reg = (pbm->controller_regs +
				     write_complete_offset);

	iommu->iommu_ctxflush = 0;

	control = upa_readq(iommu->iommu_control);
	control |= PSYCHO_IOMMU_CTRL_DENAB;
	upa_writeq(control, iommu->iommu_control);

	psycho_iommu_flush(pbm);

	/* Leave diag mode enabled for full-flushing done in pci_iommu.c */
	err = iommu_table_init(iommu, tsbsize * 1024 * 8,
			       dvma_offset, dma_mask, pbm->numa_node);
	if (err)
		return err;

	upa_writeq(__pa(iommu->page_table), iommu->iommu_tsbbase);

	control = upa_readq(iommu->iommu_control);
	control &= ~(PSYCHO_IOMMU_CTRL_TSBSZ | PSYCHO_IOMMU_CTRL_TBWSZ);
	control |= PSYCHO_IOMMU_CTRL_ENAB;

	switch (tsbsize) {
	case 64:
		control |= PSYCHO_IOMMU_TSBSZ_64K;
		break;
	case 128:
		control |= PSYCHO_IOMMU_TSBSZ_128K;
		break;
	default:
		return -EINVAL;
	}

	upa_writeq(control, iommu->iommu_control);

	return 0;

}

void psycho_pbm_init_common(struct pci_pbm_info *pbm, struct platform_device *op,
			    const char *chip_name, int chip_type)
{
	struct device_node *dp = op->dev.of_node;

	pbm->name = dp->full_name;
	pbm->numa_node = -1;
	pbm->chip_type = chip_type;
	pbm->chip_version = of_getintprop_default(dp, "version#", 0);
	pbm->chip_revision = of_getintprop_default(dp, "module-revision#", 0);
	pbm->op = op;
	pbm->pci_ops = &sun4u_pci_ops;
	pbm->config_space_reg_bits = 8;
	pbm->index = pci_num_pbms++;
	pci_get_pbm_props(pbm);
	pci_determine_mem_io_space(pbm);

	printk(KERN_INFO "%s: %s PCI Bus Module ver[%x:%x]\n",
	       pbm->name, chip_name,
	       pbm->chip_version, pbm->chip_revision);
}
/a> 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369
/* ldc.c: Logical Domain Channel link-layer protocol driver.
 *
 * Copyright (C) 2007, 2008 David S. Miller <davem@davemloft.net>
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/scatterlist.h>
#include <linux/interrupt.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/bitmap.h>

#include <asm/hypervisor.h>
#include <asm/iommu.h>
#include <asm/page.h>
#include <asm/ldc.h>
#include <asm/mdesc.h>

#define DRV_MODULE_NAME		"ldc"
#define PFX DRV_MODULE_NAME	": "
#define DRV_MODULE_VERSION	"1.1"
#define DRV_MODULE_RELDATE	"July 22, 2008"

static char version[] __devinitdata =
	DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
#define LDC_PACKET_SIZE		64

/* Packet header layout for unreliable and reliable mode frames.
 * When in RAW mode, packets are simply straight 64-byte payloads
 * with no headers.
 */
struct ldc_packet {
	u8			type;
#define LDC_CTRL		0x01
#define LDC_DATA		0x02
#define LDC_ERR			0x10

	u8			stype;
#define LDC_INFO		0x01
#define LDC_ACK			0x02
#define LDC_NACK		0x04

	u8			ctrl;
#define LDC_VERS		0x01 /* Link Version		*/
#define LDC_RTS			0x02 /* Request To Send		*/
#define LDC_RTR			0x03 /* Ready To Receive	*/
#define LDC_RDX			0x04 /* Ready for Data eXchange	*/
#define LDC_CTRL_MSK		0x0f

	u8			env;
#define LDC_LEN			0x3f
#define LDC_FRAG_MASK		0xc0
#define LDC_START		0x40
#define LDC_STOP		0x80

	u32			seqid;

	union {
		u8		u_data[LDC_PACKET_SIZE - 8];
		struct {
			u32	pad;
			u32	ackid;
			u8	r_data[LDC_PACKET_SIZE - 8 - 8];
		} r;
	} u;
};

struct ldc_version {
	u16 major;
	u16 minor;
};

/* Ordered from largest major to lowest.  */
static struct ldc_version ver_arr[] = {
	{ .major = 1, .minor = 0 },
};

#define LDC_DEFAULT_MTU			(4 * LDC_PACKET_SIZE)
#define LDC_DEFAULT_NUM_ENTRIES		(PAGE_SIZE / LDC_PACKET_SIZE)

struct ldc_channel;

struct ldc_mode_ops {
	int (*write)(struct ldc_channel *, const void *, unsigned int);
	int (*read)(struct ldc_channel *, void *, unsigned int);
};

static const struct ldc_mode_ops raw_ops;
static const struct ldc_mode_ops nonraw_ops;
static const struct ldc_mode_ops stream_ops;

int ldom_domaining_enabled;

struct ldc_iommu {
	/* Protects arena alloc/free.  */
	spinlock_t			lock;
	struct iommu_arena		arena;
	struct ldc_mtable_entry		*page_table;
};

struct ldc_channel {
	/* Protects all operations that depend upon channel state.  */
	spinlock_t			lock;

	unsigned long			id;

	u8				*mssbuf;
	u32				mssbuf_len;
	u32				mssbuf_off;

	struct ldc_packet		*tx_base;
	unsigned long			tx_head;
	unsigned long			tx_tail;
	unsigned long			tx_num_entries;
	unsigned long			tx_ra;

	unsigned long			tx_acked;

	struct ldc_packet		*rx_base;
	unsigned long			rx_head;
	unsigned long			rx_tail;
	unsigned long			rx_num_entries;
	unsigned long			rx_ra;

	u32				rcv_nxt;
	u32				snd_nxt;

	unsigned long			chan_state;

	struct ldc_channel_config	cfg;
	void				*event_arg;

	const struct ldc_mode_ops	*mops;

	struct ldc_iommu		iommu;

	struct ldc_version		ver;

	u8				hs_state;
#define LDC_HS_CLOSED			0x00
#define LDC_HS_OPEN			0x01
#define LDC_HS_GOTVERS			0x02
#define LDC_HS_SENTRTR			0x03
#define LDC_HS_GOTRTR			0x04
#define LDC_HS_COMPLETE			0x10

	u8				flags;
#define LDC_FLAG_ALLOCED_QUEUES		0x01
#define LDC_FLAG_REGISTERED_QUEUES	0x02
#define LDC_FLAG_REGISTERED_IRQS	0x04
#define LDC_FLAG_RESET			0x10

	u8				mss;
	u8				state;

#define LDC_IRQ_NAME_MAX		32
	char				rx_irq_name[LDC_IRQ_NAME_MAX];
	char				tx_irq_name[LDC_IRQ_NAME_MAX];

	struct hlist_head		mh_list;

	struct hlist_node		list;
};

#define ldcdbg(TYPE, f, a...) \
do {	if (lp->cfg.debug & LDC_DEBUG_##TYPE) \
		printk(KERN_INFO PFX "ID[%lu] " f, lp->id, ## a); \
} while (0)

static const char *state_to_str(u8 state)
{
	switch (state) {
	case LDC_STATE_INVALID:
		return "INVALID";
	case LDC_STATE_INIT:
		return "INIT";
	case LDC_STATE_BOUND:
		return "BOUND";
	case LDC_STATE_READY:
		return "READY";
	case LDC_STATE_CONNECTED:
		return "CONNECTED";
	default:
		return "<UNKNOWN>";
	}
}

static void ldc_set_state(struct ldc_channel *lp, u8 state)
{
	ldcdbg(STATE, "STATE (%s) --> (%s)\n",
	       state_to_str(lp->state),
	       state_to_str(state));

	lp->state = state;
}

static unsigned long __advance(unsigned long off, unsigned long num_entries)
{
	off += LDC_PACKET_SIZE;
	if (off == (num_entries * LDC_PACKET_SIZE))
		off = 0;

	return off;
}

static unsigned long rx_advance(struct ldc_channel *lp, unsigned long off)
{
	return __advance(off, lp->rx_num_entries);
}

static unsigned long tx_advance(struct ldc_channel *lp, unsigned long off)
{
	return __advance(off, lp->tx_num_entries);
}

static struct ldc_packet *handshake_get_tx_packet(struct ldc_channel *lp,
						  unsigned long *new_tail)
{
	struct ldc_packet *p;
	unsigned long t;

	t = tx_advance(lp, lp->tx_tail);
	if (t == lp->tx_head)
		return NULL;

	*new_tail = t;

	p = lp->tx_base;
	return p + (lp->tx_tail / LDC_PACKET_SIZE);
}

/* When we are in reliable or stream mode, have to track the next packet
 * we haven't gotten an ACK for in the TX queue using tx_acked.  We have
 * to be careful not to stomp over the queue past that point.  During
 * the handshake, we don't have TX data packets pending in the queue
 * and that's why handshake_get_tx_packet() need not be mindful of
 * lp->tx_acked.
 */
static unsigned long head_for_data(struct ldc_channel *lp)
{
	if (lp->cfg.mode == LDC_MODE_STREAM)
		return lp->tx_acked;
	return lp->tx_head;
}

static int tx_has_space_for(struct ldc_channel *lp, unsigned int size)
{
	unsigned long limit, tail, new_tail, diff;
	unsigned int mss;

	limit = head_for_data(lp);
	tail = lp->tx_tail;
	new_tail = tx_advance(lp, tail);
	if (new_tail == limit)
		return 0;

	if (limit > new_tail)
		diff = limit - new_tail;
	else
		diff = (limit +
			((lp->tx_num_entries * LDC_PACKET_SIZE) - new_tail));
	diff /= LDC_PACKET_SIZE;
	mss = lp->mss;

	if (diff * mss < size)
		return 0;

	return 1;
}

static struct ldc_packet *data_get_tx_packet(struct ldc_channel *lp,
					     unsigned long *new_tail)
{
	struct ldc_packet *p;
	unsigned long h, t;

	h = head_for_data(lp);
	t = tx_advance(lp, lp->tx_tail);
	if (t == h)
		return NULL;

	*new_tail = t;

	p = lp->tx_base;
	return p + (lp->tx_tail / LDC_PACKET_SIZE);
}

static int set_tx_tail(struct ldc_channel *lp, unsigned long tail)
{
	unsigned long orig_tail = lp->tx_tail;
	int limit = 1000;

	lp->tx_tail = tail;
	while (limit-- > 0) {
		unsigned long err;

		err = sun4v_ldc_tx_set_qtail(lp->id, tail);
		if (!err)
			return 0;

		if (err != HV_EWOULDBLOCK) {
			lp->tx_tail = orig_tail;
			return -EINVAL;
		}
		udelay(1);
	}

	lp->tx_tail = orig_tail;
	return -EBUSY;
}

/* This just updates the head value in the hypervisor using
 * a polling loop with a timeout.  The caller takes care of
 * upating software state representing the head change, if any.
 */
static int __set_rx_head(struct ldc_channel *lp, unsigned long head)
{
	int limit = 1000;

	while (limit-- > 0) {
		unsigned long err;

		err = sun4v_ldc_rx_set_qhead(lp->id, head);
		if (!err)
			return 0;

		if (err != HV_EWOULDBLOCK)
			return -EINVAL;

		udelay(1);
	}

	return -EBUSY;
}

static int send_tx_packet(struct ldc_channel *lp,
			  struct ldc_packet *p,
			  unsigned long new_tail)
{
	BUG_ON(p != (lp->tx_base + (lp->tx_tail / LDC_PACKET_SIZE)));

	return set_tx_tail(lp, new_tail);
}

static struct ldc_packet *handshake_compose_ctrl(struct ldc_channel *lp,
						 u8 stype, u8 ctrl,
						 void *data, int dlen,
						 unsigned long *new_tail)
{
	struct ldc_packet *p = handshake_get_tx_packet(lp, new_tail);

	if (p) {
		memset(p, 0, sizeof(*p));
		p->type = LDC_CTRL;
		p->stype = stype;
		p->ctrl = ctrl;
		if (data)
			memcpy(p->u.u_data, data, dlen);
	}
	return p;
}

static int start_handshake(struct ldc_channel *lp)
{
	struct ldc_packet *p;
	struct ldc_version *ver;
	unsigned long new_tail;

	ver = &ver_arr[0];

	ldcdbg(HS, "SEND VER INFO maj[%u] min[%u]\n",
	       ver->major, ver->minor);

	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS,
				   ver, sizeof(*ver), &new_tail);
	if (p) {
		int err = send_tx_packet(lp, p, new_tail);
		if (!err)
			lp->flags &= ~LDC_FLAG_RESET;
		return err;
	}
	return -EBUSY;
}

static int send_version_nack(struct ldc_channel *lp,
			     u16 major, u16 minor)
{
	struct ldc_packet *p;
	struct ldc_version ver;
	unsigned long new_tail;

	ver.major = major;
	ver.minor = minor;

	p = handshake_compose_ctrl(lp, LDC_NACK, LDC_VERS,
				   &ver, sizeof(ver), &new_tail);
	if (p) {
		ldcdbg(HS, "SEND VER NACK maj[%u] min[%u]\n",
		       ver.major, ver.minor);

		return send_tx_packet(lp, p, new_tail);
	}
	return -EBUSY;
}

static int send_version_ack(struct ldc_channel *lp,
			    struct ldc_version *vp)
{
	struct ldc_packet *p;
	unsigned long new_tail;

	p = handshake_compose_ctrl(lp, LDC_ACK, LDC_VERS,
				   vp, sizeof(*vp), &new_tail);
	if (p) {
		ldcdbg(HS, "SEND VER ACK maj[%u] min[%u]\n",
		       vp->major, vp->minor);

		return send_tx_packet(lp, p, new_tail);
	}
	return -EBUSY;
}

static int send_rts(struct ldc_channel *lp)
{
	struct ldc_packet *p;
	unsigned long new_tail;

	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RTS, NULL, 0,
				   &new_tail);
	if (p) {
		p->env = lp->cfg.mode;
		p->seqid = 0;
		lp->rcv_nxt = 0;

		ldcdbg(HS, "SEND RTS env[0x%x] seqid[0x%x]\n",
		       p->env, p->seqid);

		return send_tx_packet(lp, p, new_tail);
	}
	return -EBUSY;
}

static int send_rtr(struct ldc_channel *lp)
{
	struct ldc_packet *p;
	unsigned long new_tail;

	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RTR, NULL, 0,
				   &new_tail);
	if (p) {
		p->env = lp->cfg.mode;
		p->seqid = 0;

		ldcdbg(HS, "SEND RTR env[0x%x] seqid[0x%x]\n",
		       p->env, p->seqid);

		return send_tx_packet(lp, p, new_tail);
	}
	return -EBUSY;
}

static int send_rdx(struct ldc_channel *lp)
{
	struct ldc_packet *p;
	unsigned long new_tail;

	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RDX, NULL, 0,
				   &new_tail);
	if (p) {
		p->env = 0;
		p->seqid = ++lp->snd_nxt;
		p->u.r.ackid = lp->rcv_nxt;

		ldcdbg(HS, "SEND RDX env[0x%x] seqid[0x%x] ackid[0x%x]\n",
		       p->env, p->seqid, p->u.r.ackid);

		return send_tx_packet(lp, p, new_tail);
	}
	return -EBUSY;
}

static int send_data_nack(struct ldc_channel *lp, struct ldc_packet *data_pkt)
{
	struct ldc_packet *p;
	unsigned long new_tail;
	int err;

	p = data_get_tx_packet(lp, &new_tail);
	if (!p)
		return -EBUSY;
	memset(p, 0, sizeof(*p));
	p->type = data_pkt->type;
	p->stype = LDC_NACK;
	p->ctrl = data_pkt->ctrl & LDC_CTRL_MSK;
	p->seqid = lp->snd_nxt + 1;
	p->u.r.ackid = lp->rcv_nxt;

	ldcdbg(HS, "SEND DATA NACK type[0x%x] ctl[0x%x] seq[0x%x] ack[0x%x]\n",
	       p->type, p->ctrl, p->seqid, p->u.r.ackid);

	err = send_tx_packet(lp, p, new_tail);
	if (!err)
		lp->snd_nxt++;

	return err;
}

static int ldc_abort(struct ldc_channel *lp)
{
	unsigned long hv_err;

	ldcdbg(STATE, "ABORT\n");

	/* We report but do not act upon the hypervisor errors because
	 * there really isn't much we can do if they fail at this point.
	 */
	hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries);
	if (hv_err)
		printk(KERN_ERR PFX "ldc_abort: "
		       "sun4v_ldc_tx_qconf(%lx,%lx,%lx) failed, err=%lu\n",
		       lp->id, lp->tx_ra, lp->tx_num_entries, hv_err);

	hv_err = sun4v_ldc_tx_get_state(lp->id,
					&lp->tx_head,
					&lp->tx_tail,
					&lp->chan_state);
	if (hv_err)
		printk(KERN_ERR PFX "ldc_abort: "
		       "sun4v_ldc_tx_get_state(%lx,...) failed, err=%lu\n",
		       lp->id, hv_err);

	hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries);
	if (hv_err)
		printk(KERN_ERR PFX "ldc_abort: "
		       "sun4v_ldc_rx_qconf(%lx,%lx,%lx) failed, err=%lu\n",
		       lp->id, lp->rx_ra, lp->rx_num_entries, hv_err);

	/* Refetch the RX queue state as well, because we could be invoked
	 * here in the queue processing context.
	 */
	hv_err = sun4v_ldc_rx_get_state(lp->id,
					&lp->rx_head,
					&lp->rx_tail,
					&lp->chan_state);
	if (hv_err)
		printk(KERN_ERR PFX "ldc_abort: "
		       "sun4v_ldc_rx_get_state(%lx,...) failed, err=%lu\n",
		       lp->id, hv_err);

	return -ECONNRESET;
}

static struct ldc_version *find_by_major(u16 major)
{
	struct ldc_version *ret = NULL;
	int i;

	for (i = 0; i < ARRAY_SIZE(ver_arr); i++) {
		struct ldc_version *v = &ver_arr[i];
		if (v->major <= major) {
			ret = v;
			break;
		}
	}
	return ret;
}

static int process_ver_info(struct ldc_channel *lp, struct ldc_version *vp)
{
	struct ldc_version *vap;
	int err;

	ldcdbg(HS, "GOT VERSION INFO major[%x] minor[%x]\n",
	       vp->major, vp->minor);

	if (lp->hs_state == LDC_HS_GOTVERS) {
		lp->hs_state = LDC_HS_OPEN;
		memset(&lp->ver, 0, sizeof(lp->ver));
	}

	vap = find_by_major(vp->major);
	if (!vap) {
		err = send_version_nack(lp, 0, 0);
	} else if (vap->major != vp->major) {
		err = send_version_nack(lp, vap->major, vap->minor);
	} else {
		struct ldc_version ver = *vp;
		if (ver.minor > vap->minor)
			ver.minor = vap->minor;
		err = send_version_ack(lp, &ver);
		if (!err) {
			lp->ver = ver;
			lp->hs_state = LDC_HS_GOTVERS;
		}
	}
	if (err)
		return ldc_abort(lp);

	return 0;
}

static int process_ver_ack(struct ldc_channel *lp, struct ldc_version *vp)
{
	ldcdbg(HS, "GOT VERSION ACK major[%x] minor[%x]\n",
	       vp->major, vp->minor);

	if (lp->hs_state == LDC_HS_GOTVERS) {
		if (lp->ver.major != vp->major ||
		    lp->ver.minor != vp->minor)
			return ldc_abort(lp);
	} else {
		lp->ver = *vp;
		lp->hs_state = LDC_HS_GOTVERS;
	}
	if (send_rts(lp))
		return ldc_abort(lp);
	return 0;
}

static int process_ver_nack(struct ldc_channel *lp, struct ldc_version *vp)
{
	struct ldc_version *vap;
	struct ldc_packet *p;
	unsigned long new_tail;

	if (vp->major == 0 && vp->minor == 0)
		return ldc_abort(lp);

	vap = find_by_major(vp->major);
	if (!vap)
		return ldc_abort(lp);

	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS,
					   vap, sizeof(*vap),
					   &new_tail);
	if (!p)
		return ldc_abort(lp);

	return send_tx_packet(lp, p, new_tail);
}

static int process_version(struct ldc_channel *lp,
			   struct ldc_packet *p)
{
	struct ldc_version *vp;

	vp = (struct ldc_version *) p->u.u_data;

	switch (p->stype) {
	case LDC_INFO:
		return process_ver_info(lp, vp);

	case LDC_ACK:
		return process_ver_ack(lp, vp);

	case LDC_NACK:
		return process_ver_nack(lp, vp);

	default:
		return ldc_abort(lp);
	}
}

static int process_rts(struct ldc_channel *lp,
		       struct ldc_packet *p)
{
	ldcdbg(HS, "GOT RTS stype[%x] seqid[%x] env[%x]\n",
	       p->stype, p->seqid, p->env);

	if (p->stype     != LDC_INFO	   ||
	    lp->hs_state != LDC_HS_GOTVERS ||
	    p->env       != lp->cfg.mode)
		return ldc_abort(lp);

	lp->snd_nxt = p->seqid;
	lp->rcv_nxt = p->seqid;
	lp->hs_state = LDC_HS_SENTRTR;
	if (send_rtr(lp))
		return ldc_abort(lp);

	return 0;
}

static int process_rtr(struct ldc_channel *lp,
		       struct ldc_packet *p)
{
	ldcdbg(HS, "GOT RTR stype[%x] seqid[%x] env[%x]\n",
	       p->stype, p->seqid, p->env);

	if (p->stype     != LDC_INFO ||
	    p->env       != lp->cfg.mode)
		return ldc_abort(lp);

	lp->snd_nxt = p->seqid;
	lp->hs_state = LDC_HS_COMPLETE;
	ldc_set_state(lp, LDC_STATE_CONNECTED);
	send_rdx(lp);

	return LDC_EVENT_UP;
}

static int rx_seq_ok(struct ldc_channel *lp, u32 seqid)
{
	return lp->rcv_nxt + 1 == seqid;
}

static int process_rdx(struct ldc_channel *lp,
		       struct ldc_packet *p)
{
	ldcdbg(HS, "GOT RDX stype[%x] seqid[%x] env[%x] ackid[%x]\n",
	       p->stype, p->seqid, p->env, p->u.r.ackid);

	if (p->stype != LDC_INFO ||
	    !(rx_seq_ok(lp, p->seqid)))
		return ldc_abort(lp);

	lp->rcv_nxt = p->seqid;

	lp->hs_state = LDC_HS_COMPLETE;
	ldc_set_state(lp, LDC_STATE_CONNECTED);

	return LDC_EVENT_UP;
}

static int process_control_frame(struct ldc_channel *lp,
				 struct ldc_packet *p)
{
	switch (p->ctrl) {
	case LDC_VERS:
		return process_version(lp, p);

	case LDC_RTS:
		return process_rts(lp, p);

	case LDC_RTR:
		return process_rtr(lp, p);

	case LDC_RDX:
		return process_rdx(lp, p);

	default:
		return ldc_abort(lp);
	}
}

static int process_error_frame(struct ldc_channel *lp,
			       struct ldc_packet *p)
{
	return ldc_abort(lp);
}

static int process_data_ack(struct ldc_channel *lp,
			    struct ldc_packet *ack)
{
	unsigned long head = lp->tx_acked;
	u32 ackid = ack->u.r.ackid;

	while (1) {
		struct ldc_packet *p = lp->tx_base + (head / LDC_PACKET_SIZE);

		head = tx_advance(lp, head);

		if (p->seqid == ackid) {
			lp->tx_acked = head;
			return 0;
		}
		if (head == lp->tx_tail)
			return ldc_abort(lp);
	}

	return 0;
}

static void send_events(struct ldc_channel *lp, unsigned int event_mask)
{
	if (event_mask & LDC_EVENT_RESET)
		lp->cfg.event(lp->event_arg, LDC_EVENT_RESET);
	if (event_mask & LDC_EVENT_UP)
		lp->cfg.event(lp->event_arg, LDC_EVENT_UP);
	if (event_mask & LDC_EVENT_DATA_READY)
		lp->cfg.event(lp->event_arg, LDC_EVENT_DATA_READY);
}

static irqreturn_t ldc_rx(int irq, void *dev_id)
{
	struct ldc_channel *lp = dev_id;
	unsigned long orig_state, hv_err, flags;
	unsigned int event_mask;

	spin_lock_irqsave(&lp->lock, flags);

	orig_state = lp->chan_state;
	hv_err = sun4v_ldc_rx_get_state(lp->id,
					&lp->rx_head,
					&lp->rx_tail,
					&lp->chan_state);

	ldcdbg(RX, "RX state[0x%02lx:0x%02lx] head[0x%04lx] tail[0x%04lx]\n",
	       orig_state, lp->chan_state, lp->rx_head, lp->rx_tail);

	event_mask = 0;

	if (lp->cfg.mode == LDC_MODE_RAW &&
	    lp->chan_state == LDC_CHANNEL_UP) {
		lp->hs_state = LDC_HS_COMPLETE;
		ldc_set_state(lp, LDC_STATE_CONNECTED);

		event_mask |= LDC_EVENT_UP;

		orig_state = lp->chan_state;
	}

	/* If we are in reset state, flush the RX queue and ignore
	 * everything.
	 */
	if (lp->flags & LDC_FLAG_RESET) {
		(void) __set_rx_head(lp, lp->rx_tail);
		goto out;
	}

	/* Once we finish the handshake, we let the ldc_read()
	 * paths do all of the control frame and state management.
	 * Just trigger the callback.
	 */
	if (lp->hs_state == LDC_HS_COMPLETE) {
handshake_complete:
		if (lp->chan_state != orig_state) {
			unsigned int event = LDC_EVENT_RESET;

			if (lp->chan_state == LDC_CHANNEL_UP)
				event = LDC_EVENT_UP;

			event_mask |= event;
		}
		if (lp->rx_head != lp->rx_tail)
			event_mask |= LDC_EVENT_DATA_READY;

		goto out;
	}

	if (lp->chan_state != orig_state)
		goto out;

	while (lp->rx_head != lp->rx_tail) {
		struct ldc_packet *p;
		unsigned long new;
		int err;

		p = lp->rx_base + (lp->rx_head / LDC_PACKET_SIZE);

		switch (p->type) {
		case LDC_CTRL:
			err = process_control_frame(lp, p);
			if (err > 0)
				event_mask |= err;
			break;

		case LDC_DATA:
			event_mask |= LDC_EVENT_DATA_READY;
			err = 0;
			break;

		case LDC_ERR:
			err = process_error_frame(lp, p);
			break;

		default:
			err = ldc_abort(lp);
			break;
		}

		if (err < 0)
			break;

		new = lp->rx_head;
		new += LDC_PACKET_SIZE;
		if (new == (lp->rx_num_entries * LDC_PACKET_SIZE))
			new = 0;
		lp->rx_head = new;

		err = __set_rx_head(lp, new);
		if (err < 0) {
			(void) ldc_abort(lp);
			break;
		}
		if (lp->hs_state == LDC_HS_COMPLETE)
			goto handshake_complete;
	}

out:
	spin_unlock_irqrestore(&lp->lock, flags);

	send_events(lp, event_mask);

	return IRQ_HANDLED;
}

static irqreturn_t ldc_tx(int irq, void *dev_id)
{
	struct ldc_channel *lp = dev_id;
	unsigned long flags, hv_err, orig_state;
	unsigned int event_mask = 0;

	spin_lock_irqsave(&lp->lock, flags);

	orig_state = lp->chan_state;
	hv_err = sun4v_ldc_tx_get_state(lp->id,
					&lp->tx_head,
					&lp->tx_tail,
					&lp->chan_state);

	ldcdbg(TX, " TX state[0x%02lx:0x%02lx] head[0x%04lx] tail[0x%04lx]\n",
	       orig_state, lp->chan_state, lp->tx_head, lp->tx_tail);

	if (lp->cfg.mode == LDC_MODE_RAW &&
	    lp->chan_state == LDC_CHANNEL_UP) {
		lp->hs_state = LDC_HS_COMPLETE;
		ldc_set_state(lp, LDC_STATE_CONNECTED);

		event_mask |= LDC_EVENT_UP;
	}

	spin_unlock_irqrestore(&lp->lock, flags);

	send_events(lp, event_mask);

	return IRQ_HANDLED;
}

/* XXX ldc_alloc() and ldc_free() needs to run under a mutex so
 * XXX that addition and removal from the ldc_channel_list has
 * XXX atomicity, otherwise the __ldc_channel_exists() check is
 * XXX totally pointless as another thread can slip into ldc_alloc()
 * XXX and add a channel with the same ID.  There also needs to be
 * XXX a spinlock for ldc_channel_list.
 */
static HLIST_HEAD(ldc_channel_list);

static int __ldc_channel_exists(unsigned long id)
{
	struct ldc_channel *lp;
	struct hlist_node *n;

	hlist_for_each_entry(lp, n, &ldc_channel_list, list) {
		if (lp->id == id)
			return 1;
	}
	return 0;
}

static int alloc_queue(const char *name, unsigned long num_entries,
		       struct ldc_packet **base, unsigned long *ra)
{
	unsigned long size, order;
	void *q;

	size = num_entries * LDC_PACKET_SIZE;
	order = get_order(size);

	q = (void *) __get_free_pages(GFP_KERNEL, order);
	if (!q) {
		printk(KERN_ERR PFX "Alloc of %s queue failed with "
		       "size=%lu order=%lu\n", name, size, order);
		return -ENOMEM;
	}

	memset(q, 0, PAGE_SIZE << order);

	*base = q;
	*ra = __pa(q);

	return 0;
}

static void free_queue(unsigned long num_entries, struct ldc_packet *q)
{
	unsigned long size, order;

	if (!q)
		return;

	size = num_entries * LDC_PACKET_SIZE;
	order = get_order(size);

	free_pages((unsigned long)q, order);
}

/* XXX Make this configurable... XXX */
#define LDC_IOTABLE_SIZE	(8 * 1024)

static int ldc_iommu_init(struct ldc_channel *lp)
{
	unsigned long sz, num_tsb_entries, tsbsize, order;
	struct ldc_iommu *iommu = &lp->iommu;
	struct ldc_mtable_entry *table;
	unsigned long hv_err;
	int err;

	num_tsb_entries = LDC_IOTABLE_SIZE;
	tsbsize = num_tsb_entries * sizeof(struct ldc_mtable_entry);

	spin_lock_init(&iommu->lock);

	sz = num_tsb_entries / 8;
	sz = (sz + 7UL) & ~7UL;
	iommu->arena.map = kzalloc(sz, GFP_KERNEL);
	if (!iommu->arena.map) {
		printk(KERN_ERR PFX "Alloc of arena map failed, sz=%lu\n", sz);
		return -ENOMEM;
	}

	iommu->arena.limit = num_tsb_entries;

	order = get_order(tsbsize);

	table = (struct ldc_mtable_entry *)
		__get_free_pages(GFP_KERNEL, order);
	err = -ENOMEM;
	if (!table) {
		printk(KERN_ERR PFX "Alloc of MTE table failed, "
		       "size=%lu order=%lu\n", tsbsize, order);
		goto out_free_map;
	}

	memset(table, 0, PAGE_SIZE << order);

	iommu->page_table = table;

	hv_err = sun4v_ldc_set_map_table(lp->id, __pa(table),
					 num_tsb_entries);
	err = -EINVAL;
	if (hv_err)
		goto out_free_table;

	return 0;

out_free_table:
	free_pages((unsigned long) table, order);
	iommu->page_table = NULL;

out_free_map:
	kfree(iommu->arena.map);
	iommu->arena.map = NULL;

	return err;
}

static void ldc_iommu_release(struct ldc_channel *lp)
{
	struct ldc_iommu *iommu = &lp->iommu;
	unsigned long num_tsb_entries, tsbsize, order;

	(void) sun4v_ldc_set_map_table(lp->id, 0, 0);

	num_tsb_entries = iommu->arena.limit;
	tsbsize = num_tsb_entries * sizeof(struct ldc_mtable_entry);
	order = get_order(tsbsize);

	free_pages((unsigned long) iommu->page_table, order);
	iommu->page_table = NULL;

	kfree(iommu->arena.map);
	iommu->arena.map = NULL;
}

struct ldc_channel *ldc_alloc(unsigned long id,
			      const struct ldc_channel_config *cfgp,
			      void *event_arg)
{
	struct ldc_channel *lp;
	const struct ldc_mode_ops *mops;
	unsigned long dummy1, dummy2, hv_err;
	u8 mss, *mssbuf;
	int err;

	err = -ENODEV;
	if (!ldom_domaining_enabled)
		goto out_err;

	err = -EINVAL;
	if (!cfgp)
		goto out_err;

	switch (cfgp->mode) {
	case LDC_MODE_RAW:
		mops = &raw_ops;
		mss = LDC_PACKET_SIZE;
		break;

	case LDC_MODE_UNRELIABLE:
		mops = &nonraw_ops;
		mss = LDC_PACKET_SIZE - 8;
		break;

	case LDC_MODE_STREAM:
		mops = &stream_ops;
		mss = LDC_PACKET_SIZE - 8 - 8;
		break;

	default:
		goto out_err;
	}

	if (!cfgp->event || !event_arg || !cfgp->rx_irq || !cfgp->tx_irq)
		goto out_err;

	hv_err = sun4v_ldc_tx_qinfo(id, &dummy1, &dummy2);
	err = -ENODEV;
	if (hv_err == HV_ECHANNEL)
		goto out_err;

	err = -EEXIST;
	if (__ldc_channel_exists(id))
		goto out_err;

	mssbuf = NULL;

	lp = kzalloc(sizeof(*lp), GFP_KERNEL);
	err = -ENOMEM;
	if (!lp)
		goto out_err;

	spin_lock_init(&lp->lock);

	lp->id = id;

	err = ldc_iommu_init(lp);
	if (err)
		goto out_free_ldc;

	lp->mops = mops;
	lp->mss = mss;

	lp->cfg = *cfgp;
	if (!lp->cfg.mtu)
		lp->cfg.mtu = LDC_DEFAULT_MTU;

	if (lp->cfg.mode == LDC_MODE_STREAM) {
		mssbuf = kzalloc(lp->cfg.mtu, GFP_KERNEL);
		if (!mssbuf) {
			err = -ENOMEM;
			goto out_free_iommu;
		}
		lp->mssbuf = mssbuf;
	}

	lp->event_arg = event_arg;

	/* XXX allow setting via ldc_channel_config to override defaults
	 * XXX or use some formula based upon mtu
	 */
	lp->tx_num_entries = LDC_DEFAULT_NUM_ENTRIES;
	lp->rx_num_entries = LDC_DEFAULT_NUM_ENTRIES;

	err = alloc_queue("TX", lp->tx_num_entries,
			  &lp->tx_base, &lp->tx_ra);
	if (err)
		goto out_free_mssbuf;

	err = alloc_queue("RX", lp->rx_num_entries,
			  &lp->rx_base, &lp->rx_ra);
	if (err)
		goto out_free_txq;

	lp->flags |= LDC_FLAG_ALLOCED_QUEUES;

	lp->hs_state = LDC_HS_CLOSED;
	ldc_set_state(lp, LDC_STATE_INIT);

	INIT_HLIST_NODE(&lp->list);
	hlist_add_head(&lp->list, &ldc_channel_list);

	INIT_HLIST_HEAD(&lp->mh_list);

	return lp;

out_free_txq:
	free_queue(lp->tx_num_entries, lp->tx_base);

out_free_mssbuf:
	kfree(mssbuf);

out_free_iommu:
	ldc_iommu_release(lp);

out_free_ldc:
	kfree(lp);

out_err:
	return ERR_PTR(err);
}
EXPORT_SYMBOL(ldc_alloc);

void ldc_free(struct ldc_channel *lp)
{
	if (lp->flags & LDC_FLAG_REGISTERED_IRQS) {
		free_irq(lp->cfg.rx_irq, lp);
		free_irq(lp->cfg.tx_irq, lp);
	}

	if (lp->flags & LDC_FLAG_REGISTERED_QUEUES) {
		sun4v_ldc_tx_qconf(lp->id, 0, 0);
		sun4v_ldc_rx_qconf(lp->id, 0, 0);
		lp->flags &= ~LDC_FLAG_REGISTERED_QUEUES;
	}
	if (lp->flags & LDC_FLAG_ALLOCED_QUEUES) {
		free_queue(lp->tx_num_entries, lp->tx_base);
		free_queue(lp->rx_num_entries, lp->rx_base);
		lp->flags &= ~LDC_FLAG_ALLOCED_QUEUES;
	}

	hlist_del(&lp->list);

	kfree(lp->mssbuf);

	ldc_iommu_release(lp);

	kfree(lp);
}
EXPORT_SYMBOL(ldc_free);

/* Bind the channel.  This registers the LDC queues with
 * the hypervisor and puts the channel into a pseudo-listening
 * state.  This does not initiate a handshake, ldc_connect() does
 * that.
 */
int ldc_bind(struct ldc_channel *lp, const char *name)
{
	unsigned long hv_err, flags;
	int err = -EINVAL;

	if (!name ||
	    (lp->state != LDC_STATE_INIT))
		return -EINVAL;

	snprintf(lp->rx_irq_name, LDC_IRQ_NAME_MAX, "%s RX", name);
	snprintf(lp->tx_irq_name, LDC_IRQ_NAME_MAX, "%s TX", name);

	err = request_irq(lp->cfg.rx_irq, ldc_rx,
			  IRQF_SAMPLE_RANDOM | IRQF_DISABLED,
			  lp->rx_irq_name, lp);
	if (err)
		return err;

	err = request_irq(lp->cfg.tx_irq, ldc_tx,
			  IRQF_SAMPLE_RANDOM | IRQF_DISABLED,
			  lp->tx_irq_name, lp);
	if (err) {
		free_irq(lp->cfg.rx_irq, lp);
		return err;
	}


	spin_lock_irqsave(&lp->lock, flags);

	enable_irq(lp->cfg.rx_irq);
	enable_irq(lp->cfg.tx_irq);

	lp->flags |= LDC_FLAG_REGISTERED_IRQS;

	err = -ENODEV;
	hv_err = sun4v_ldc_tx_qconf(lp->id, 0, 0);
	if (hv_err)
		goto out_free_irqs;

	hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries);
	if (hv_err)
		goto out_free_irqs;

	hv_err = sun4v_ldc_rx_qconf(lp->id, 0, 0);
	if (hv_err)
		goto out_unmap_tx;

	hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries);
	if (hv_err)
		goto out_unmap_tx;

	lp->flags |= LDC_FLAG_REGISTERED_QUEUES;

	hv_err = sun4v_ldc_tx_get_state(lp->id,
					&lp->tx_head,
					&lp->tx_tail,
					&lp->chan_state);
	err = -EBUSY;
	if (hv_err)
		goto out_unmap_rx;

	lp->tx_acked = lp->tx_head;

	lp->hs_state = LDC_HS_OPEN;
	ldc_set_state(lp, LDC_STATE_BOUND);

	spin_unlock_irqrestore(&lp->lock, flags);

	return 0;

out_unmap_rx:
	lp->flags &= ~LDC_FLAG_REGISTERED_QUEUES;
	sun4v_ldc_rx_qconf(lp->id, 0, 0);

out_unmap_tx:
	sun4v_ldc_tx_qconf(lp->id, 0, 0);

out_free_irqs:
	lp->flags &= ~LDC_FLAG_REGISTERED_IRQS;
	free_irq(lp->cfg.tx_irq, lp);
	free_irq(lp->cfg.rx_irq, lp);

	spin_unlock_irqrestore(&lp->lock, flags);

	return err;
}
EXPORT_SYMBOL(ldc_bind);

int ldc_connect(struct ldc_channel *lp)
{
	unsigned long flags;
	int err;

	if (lp->cfg.mode == LDC_MODE_RAW)
		return -EINVAL;

	spin_lock_irqsave(&lp->lock, flags);

	if (!(lp->flags & LDC_FLAG_ALLOCED_QUEUES) ||
	    !(lp->flags & LDC_FLAG_REGISTERED_QUEUES) ||
	    lp->hs_state != LDC_HS_OPEN)
		err = -EINVAL;
	else
		err = start_handshake(lp);

	spin_unlock_irqrestore(&lp->lock, flags);

	return err;
}
EXPORT_SYMBOL(ldc_connect);

int ldc_disconnect(struct ldc_channel *lp)
{
	unsigned long hv_err, flags;
	int err;

	if (lp->cfg.mode == LDC_MODE_RAW)
		return -EINVAL;

	if (!(lp->flags & LDC_FLAG_ALLOCED_QUEUES) ||
	    !(lp->flags & LDC_FLAG_REGISTERED_QUEUES))
		return -EINVAL;

	spin_lock_irqsave(&lp->lock, flags);

	err = -ENODEV;
	hv_err = sun4v_ldc_tx_qconf(lp->id, 0, 0);
	if (hv_err)
		goto out_err;

	hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries);
	if (hv_err)
		goto out_err;

	hv_err = sun4v_ldc_rx_qconf(lp->id, 0, 0);
	if (hv_err)
		goto out_err;

	hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries);
	if (hv_err)
		goto out_err;

	ldc_set_state(lp, LDC_STATE_BOUND);
	lp->hs_state = LDC_HS_OPEN;
	lp->flags |= LDC_FLAG_RESET;

	spin_unlock_irqrestore(&lp->lock, flags);

	return 0;

out_err:
	sun4v_ldc_tx_qconf(lp->id, 0, 0);
	sun4v_ldc_rx_qconf(lp->id, 0, 0);
	free_irq(lp->cfg.tx_irq, lp);
	free_irq(lp->cfg.rx_irq, lp);
	lp->flags &= ~(LDC_FLAG_REGISTERED_IRQS |
		       LDC_FLAG_REGISTERED_QUEUES);
	ldc_set_state(lp, LDC_STATE_INIT);

	spin_unlock_irqrestore(&lp->lock, flags);

	return err;
}
EXPORT_SYMBOL(ldc_disconnect);

int ldc_state(struct ldc_channel *lp)
{
	return lp->state;
}
EXPORT_SYMBOL(ldc_state);

static int write_raw(struct ldc_channel *lp, const void *buf, unsigned int size)
{
	struct ldc_packet *p;
	unsigned long new_tail;
	int err;

	if (size > LDC_PACKET_SIZE)
		return -EMSGSIZE;

	p = data_get_tx_packet(lp, &new_tail);
	if (!p)
		return -EAGAIN;

	memcpy(p, buf, size);

	err = send_tx_packet(lp, p, new_tail);
	if (!err)
		err = size;

	return err;
}

static int read_raw(struct ldc_channel *lp, void *buf, unsigned int size)
{
	struct ldc_packet *p;
	unsigned long hv_err, new;
	int err;

	if (size < LDC_PACKET_SIZE)
		return -EINVAL;

	hv_err = sun4v_ldc_rx_get_state(lp->id,
					&lp->rx_head,
					&lp->rx_tail,
					&lp->chan_state);
	if (hv_err)
		return ldc_abort(lp);

	if (lp->chan_state == LDC_CHANNEL_DOWN ||
	    lp->chan_state == LDC_CHANNEL_RESETTING)
		return -ECONNRESET;

	if (lp->rx_head == lp->rx_tail)
		return 0;

	p = lp->rx_base + (lp->rx_head / LDC_PACKET_SIZE);
	memcpy(buf, p, LDC_PACKET_SIZE);

	new = rx_advance(lp, lp->rx_head);
	lp->rx_head = new;

	err = __set_rx_head(lp, new);
	if (err < 0)
		err = -ECONNRESET;
	else
		err = LDC_PACKET_SIZE;

	return err;
}

static const struct ldc_mode_ops raw_ops = {
	.write		=	write_raw,
	.read		=	read_raw,
};

static int write_nonraw(struct ldc_channel *lp, const void *buf,
			unsigned int size)
{
	unsigned long hv_err, tail;
	unsigned int copied;
	u32 seq;
	int err;

	hv_err = sun4v_ldc_tx_get_state(lp->id, &lp->tx_head, &lp->tx_tail,
					&lp->chan_state);
	if (unlikely(hv_err))
		return -EBUSY;

	if (unlikely(lp->chan_state != LDC_CHANNEL_UP))
		return ldc_abort(lp);

	if (!tx_has_space_for(lp, size))
		return -EAGAIN;

	seq = lp->snd_nxt;
	copied = 0;
	tail = lp->tx_tail;
	while (copied < size) {
		struct ldc_packet *p = lp->tx_base + (tail / LDC_PACKET_SIZE);
		u8 *data = ((lp->cfg.mode == LDC_MODE_UNRELIABLE) ?
			    p->u.u_data :
			    p->u.r.r_data);
		int data_len;

		p->type = LDC_DATA;
		p->stype = LDC_INFO;
		p->ctrl = 0;

		data_len = size - copied;
		if (data_len > lp->mss)
			data_len = lp->mss;

		BUG_ON(data_len > LDC_LEN);

		p->env = (data_len |
			  (copied == 0 ? LDC_START : 0) |
			  (data_len == size - copied ? LDC_STOP : 0));

		p->seqid = ++seq;

		ldcdbg(DATA, "SENT DATA [%02x:%02x:%02x:%02x:%08x]\n",
		       p->type,
		       p->stype,
		       p->ctrl,
		       p->env,
		       p->seqid);

		memcpy(data, buf, data_len);
		buf += data_len;
		copied += data_len;

		tail = tx_advance(lp, tail);
	}

	err = set_tx_tail(lp, tail);
	if (!err) {
		lp->snd_nxt = seq;
		err = size;
	}

	return err;
}

static int rx_bad_seq(struct ldc_channel *lp, struct ldc_packet *p,
		      struct ldc_packet *first_frag)
{
	int err;

	if (first_frag)
		lp->rcv_nxt = first_frag->seqid - 1;

	err = send_data_nack(lp, p);
	if (err)
		return err;

	err = __set_rx_head(lp, lp->rx_tail);
	if (err < 0)
		return ldc_abort(lp);

	return 0;
}

static int data_ack_nack(struct ldc_channel *lp, struct ldc_packet *p)
{
	if (p->stype & LDC_ACK) {
		int err = process_data_ack(lp, p);
		if (err)
			return err;
	}
	if (p->stype & LDC_NACK)
		return ldc_abort(lp);

	return 0;
}

static int rx_data_wait(struct ldc_channel *lp, unsigned long cur_head)
{
	unsigned long dummy;
	int limit = 1000;

	ldcdbg(DATA, "DATA WAIT cur_head[%lx] rx_head[%lx] rx_tail[%lx]\n",
	       cur_head, lp->rx_head, lp->rx_tail);
	while (limit-- > 0) {
		unsigned long hv_err;

		hv_err = sun4v_ldc_rx_get_state(lp->id,
						&dummy,
						&lp->rx_tail,
						&lp->chan_state);
		if (hv_err)
			return ldc_abort(lp);

		if (lp->chan_state == LDC_CHANNEL_DOWN ||
		    lp->chan_state == LDC_CHANNEL_RESETTING)
			return -ECONNRESET;

		if (cur_head != lp->rx_tail) {
			ldcdbg(DATA, "DATA WAIT DONE "
			       "head[%lx] tail[%lx] chan_state[%lx]\n",
			       dummy, lp->rx_tail, lp->chan_state);
			return 0;
		}

		udelay(1);
	}
	return -EAGAIN;
}

static int rx_set_head(struct ldc_channel *lp, unsigned long head)
{
	int err = __set_rx_head(lp, head);

	if (err < 0)
		return ldc_abort(lp);

	lp->rx_head = head;
	return 0;
}

static void send_data_ack(struct ldc_channel *lp)
{
	unsigned long new_tail;
	struct ldc_packet *p;

	p = data_get_tx_packet(lp, &new_tail);
	if (likely(p)) {
		int err;

		memset(p, 0, sizeof(*p));
		p->type = LDC_DATA;
		p->stype = LDC_ACK;
		p->ctrl = 0;
		p->seqid = lp->snd_nxt + 1;
		p->u.r.ackid = lp->rcv_nxt;

		err = send_tx_packet(lp, p, new_tail);
		if (!err)
			lp->snd_nxt++;
	}
}

static int read_nonraw(struct ldc_channel *lp, void *buf, unsigned int size)
{
	struct ldc_packet *first_frag;
	unsigned long hv_err, new;
	int err, copied;

	hv_err = sun4v_ldc_rx_get_state(lp->id,
					&lp->rx_head,
					&lp->rx_tail,
					&lp->chan_state);
	if (hv_err)
		return ldc_abort(lp);

	if (lp->chan_state == LDC_CHANNEL_DOWN ||
	    lp->chan_state == LDC_CHANNEL_RESETTING)
		return -ECONNRESET;

	if (lp->rx_head == lp->rx_tail)
		return 0;

	first_frag = NULL;
	copied = err = 0;
	new = lp->rx_head;
	while (1) {
		struct ldc_packet *p;
		int pkt_len;

		BUG_ON(new == lp->rx_tail);
		p = lp->rx_base + (new / LDC_PACKET_SIZE);

		ldcdbg(RX, "RX read pkt[%02x:%02x:%02x:%02x:%08x:%08x] "
		       "rcv_nxt[%08x]\n",
		       p->type,
		       p->stype,
		       p->ctrl,
		       p->env,
		       p->seqid,
		       p->u.r.ackid,
		       lp->rcv_nxt);

		if (unlikely(!rx_seq_ok(lp, p->seqid))) {
			err = rx_bad_seq(lp, p, first_frag);
			copied = 0;
			break;
		}

		if (p->type & LDC_CTRL) {
			err = process_control_frame(lp, p);
			if (err < 0)
				break;
			err = 0;
		}

		lp->rcv_nxt = p->seqid;

		if (!(p->type & LDC_DATA)) {
			new = rx_advance(lp, new);
			goto no_data;
		}
		if (p->stype & (LDC_ACK | LDC_NACK)) {
			err = data_ack_nack(lp, p);
			if (err)
				break;
		}
		if (!(p->stype & LDC_INFO)) {
			new = rx_advance(lp, new);
			err = rx_set_head(lp, new);
			if (err)
				break;
			goto no_data;
		}

		pkt_len = p->env & LDC_LEN;

		/* Every initial packet starts with the START bit set.
		 *
		 * Singleton packets will have both START+STOP set.
		 *
		 * Fragments will have START set in the first frame, STOP
		 * set in the last frame, and neither bit set in middle
		 * frames of the packet.
		 *
		 * Therefore if we are at the beginning of a packet and
		 * we don't see START, or we are in the middle of a fragmented
		 * packet and do see START, we are unsynchronized and should
		 * flush the RX queue.
		 */
		if ((first_frag == NULL && !(p->env & LDC_START)) ||
		    (first_frag != NULL &&  (p->env & LDC_START))) {
			if (!first_frag)
				new = rx_advance(lp, new);

			err = rx_set_head(lp, new);
			if (err)
				break;

			if (!first_frag)
				goto no_data;
		}
		if (!first_frag)
			first_frag = p;

		if (pkt_len > size - copied) {
			/* User didn't give us a big enough buffer,
			 * what to do?  This is a pretty serious error.
			 *
			 * Since we haven't updated the RX ring head to
			 * consume any of the packets, signal the error
			 * to the user and just leave the RX ring alone.
			 *
			 * This seems the best behavior because this allows
			 * a user of the LDC layer to start with a small
			 * RX buffer for ldc_read() calls and use -EMSGSIZE
			 * as a cue to enlarge it's read buffer.
			 */
			err = -EMSGSIZE;
			break;
		}

		/* Ok, we are gonna eat this one.  */
		new = rx_advance(lp, new);

		memcpy(buf,
		       (lp->cfg.mode == LDC_MODE_UNRELIABLE ?
			p->u.u_data : p->u.r.r_data), pkt_len);
		buf += pkt_len;
		copied += pkt_len;

		if (p->env & LDC_STOP)
			break;

no_data:
		if (new == lp->rx_tail) {
			err = rx_data_wait(lp, new);
			if (err)
				break;
		}
	}

	if (!err)
		err = rx_set_head(lp, new);

	if (err && first_frag)
		lp->rcv_nxt = first_frag->seqid - 1;

	if (!err) {
		err = copied;
		if (err > 0 && lp->cfg.mode != LDC_MODE_UNRELIABLE)
			send_data_ack(lp);
	}

	return err;
}

static const struct ldc_mode_ops nonraw_ops = {
	.write		=	write_nonraw,
	.read		=	read_nonraw,
};

static int write_stream(struct ldc_channel *lp, const void *buf,
			unsigned int size)
{
	if (size > lp->cfg.mtu)
		size = lp->cfg.mtu;
	return write_nonraw(lp, buf, size);
}

static int read_stream(struct ldc_channel *lp, void *buf, unsigned int size)
{
	if (!lp->mssbuf_len) {
		int err = read_nonraw(lp, lp->mssbuf, lp->cfg.mtu);
		if (err < 0)
			return err;

		lp->mssbuf_len = err;
		lp->mssbuf_off = 0;
	}

	if (size > lp->mssbuf_len)
		size = lp->mssbuf_len;
	memcpy(buf, lp->mssbuf + lp->mssbuf_off, size);

	lp->mssbuf_off += size;
	lp->mssbuf_len -= size;

	return size;
}

static const struct ldc_mode_ops stream_ops = {
	.write		=	write_stream,
	.read		=	read_stream,
};

int ldc_write(struct ldc_channel *lp, const void *buf, unsigned int size)
{
	unsigned long flags;
	int err;

	if (!buf)
		return -EINVAL;

	if (!size)
		return 0;

	spin_lock_irqsave(&lp->lock, flags);

	if (lp->hs_state != LDC_HS_COMPLETE)
		err = -ENOTCONN;
	else
		err = lp->mops->write(lp, buf, size);

	spin_unlock_irqrestore(&lp->lock, flags);

	return err;
}
EXPORT_SYMBOL(ldc_write);

int ldc_read(struct ldc_channel *lp, void *buf, unsigned int size)
{
	unsigned long flags;
	int err;

	if (!buf)
		return -EINVAL;

	if (!size)
		return 0;

	spin_lock_irqsave(&lp->lock, flags);

	if (lp->hs_state != LDC_HS_COMPLETE)
		err = -ENOTCONN;
	else
		err = lp->mops->read(lp, buf, size);

	spin_unlock_irqrestore(&lp->lock, flags);

	return err;
}
EXPORT_SYMBOL(ldc_read);

static long arena_alloc(struct ldc_iommu *iommu, unsigned long npages)
{
	struct iommu_arena *arena = &iommu->arena;
	unsigned long n, start, end, limit;
	int pass;

	limit = arena->limit;
	start = arena->hint;
	pass = 0;

again:
	n = bitmap_find_next_zero_area(arena->map, limit, start, npages, 0);
	end = n + npages;
	if (unlikely(end >= limit)) {
		if (likely(pass < 1)) {
			limit = start;
			start = 0;
			pass++;
			goto again;
		} else {
			/* Scanned the whole thing, give up. */
			return -1;
		}
	}
	bitmap_set(arena->map, n, npages);

	arena->hint = end;

	return n;
}

#define COOKIE_PGSZ_CODE	0xf000000000000000ULL
#define COOKIE_PGSZ_CODE_SHIFT	60ULL

static u64 pagesize_code(void)
{
	switch (PAGE_SIZE) {
	default:
	case (8ULL * 1024ULL):
		return 0;
	case (64ULL * 1024ULL):
		return 1;
	case (512ULL * 1024ULL):
		return 2;
	case (4ULL * 1024ULL * 1024ULL):
		return 3;
	case (32ULL * 1024ULL * 1024ULL):
		return 4;
	case (256ULL * 1024ULL * 1024ULL):
		return 5;
	}
}

static u64 make_cookie(u64 index, u64 pgsz_code, u64 page_offset)
{
	return ((pgsz_code << COOKIE_PGSZ_CODE_SHIFT) |
		(index << PAGE_SHIFT) |
		page_offset);
}

static u64 cookie_to_index(u64 cookie, unsigned long *shift)
{
	u64 szcode = cookie >> COOKIE_PGSZ_CODE_SHIFT;

	cookie &= ~COOKIE_PGSZ_CODE;

	*shift = szcode * 3;

	return (cookie >> (13ULL + (szcode * 3ULL)));
}

static struct ldc_mtable_entry *alloc_npages(struct ldc_iommu *iommu,
					     unsigned long npages)
{
	long entry;

	entry = arena_alloc(iommu, npages);
	if (unlikely(entry < 0))
		return NULL;

	return iommu->page_table + entry;
}

static u64 perm_to_mte(unsigned int map_perm)
{
	u64 mte_base;

	mte_base = pagesize_code();

	if (map_perm & LDC_MAP_SHADOW) {
		if (map_perm & LDC_MAP_R)
			mte_base |= LDC_MTE_COPY_R;
		if (map_perm & LDC_MAP_W)
			mte_base |= LDC_MTE_COPY_W;
	}
	if (map_perm & LDC_MAP_DIRECT) {
		if (map_perm & LDC_MAP_R)
			mte_base |= LDC_MTE_READ;
		if (map_perm & LDC_MAP_W)
			mte_base |= LDC_MTE_WRITE;
		if (map_perm & LDC_MAP_X)
			mte_base |= LDC_MTE_EXEC;
	}
	if (map_perm & LDC_MAP_IO) {
		if (map_perm & LDC_MAP_R)
			mte_base |= LDC_MTE_IOMMU_R;
		if (map_perm & LDC_MAP_W)
			mte_base |= LDC_MTE_IOMMU_W;
	}

	return mte_base;
}

static int pages_in_region(unsigned long base, long len)
{
	int count = 0;

	do {
		unsigned long new = (base + PAGE_SIZE) & PAGE_MASK;

		len -= (new - base);
		base = new;
		count++;
	} while (len > 0);

	return count;
}

struct cookie_state {
	struct ldc_mtable_entry		*page_table;
	struct ldc_trans_cookie		*cookies;
	u64				mte_base;
	u64				prev_cookie;
	u32				pte_idx;
	u32				nc;
};

static void fill_cookies(struct cookie_state *sp, unsigned long pa,
			 unsigned long off, unsigned long len)
{
	do {
		unsigned long tlen, new = pa + PAGE_SIZE;
		u64 this_cookie;

		sp->page_table[sp->pte_idx].mte = sp->mte_base | pa;

		tlen = PAGE_SIZE;
		if (off)
			tlen = PAGE_SIZE - off;
		if (tlen > len)
			tlen = len;

		this_cookie = make_cookie(sp->pte_idx,
					  pagesize_code(), off);

		off = 0;

		if (this_cookie == sp->prev_cookie) {
			sp->cookies[sp->nc - 1].cookie_size += tlen;
		} else {
			sp->cookies[sp->nc].cookie_addr = this_cookie;
			sp->cookies[sp->nc].cookie_size = tlen;
			sp->nc++;
		}
		sp->prev_cookie = this_cookie + tlen;

		sp->pte_idx++;

		len -= tlen;
		pa = new;
	} while (len > 0);
}

static int sg_count_one(struct scatterlist *sg)
{
	unsigned long base = page_to_pfn(sg_page(sg)) << PAGE_SHIFT;
	long len = sg->length;

	if ((sg->offset | len) & (8UL - 1))
		return -EFAULT;

	return pages_in_region(base + sg->offset, len);
}

static int sg_count_pages(struct scatterlist *sg, int num_sg)
{
	int count;
	int i;

	count = 0;
	for (i = 0; i < num_sg; i++) {
		int err = sg_count_one(sg + i);
		if (err < 0)
			return err;
		count += err;
	}

	return count;
}

int ldc_map_sg(struct ldc_channel *lp,
	       struct scatterlist *sg, int num_sg,
	       struct ldc_trans_cookie *cookies, int ncookies,
	       unsigned int map_perm)
{
	unsigned long i, npages, flags;
	struct ldc_mtable_entry *base;
	struct cookie_state state;
	struct ldc_iommu *iommu;
	int err;

	if (map_perm & ~LDC_MAP_ALL)
		return -EINVAL;

	err = sg_count_pages(sg, num_sg);
	if (err < 0)
		return err;

	npages = err;
	if (err > ncookies)
		return -EMSGSIZE;

	iommu = &lp->iommu;

	spin_lock_irqsave(&iommu->lock, flags);
	base = alloc_npages(iommu, npages);
	spin_unlock_irqrestore(&iommu->lock, flags);

	if (!base)
		return -ENOMEM;

	state.page_table = iommu->page_table;
	state.cookies = cookies;
	state.mte_base = perm_to_mte(map_perm);
	state.prev_cookie = ~(u64)0;
	state.pte_idx = (base - iommu->page_table);
	state.nc = 0;

	for (i = 0; i < num_sg; i++)
		fill_cookies(&state, page_to_pfn(sg_page(&sg[i])) << PAGE_SHIFT,
			     sg[i].offset, sg[i].length);

	return state.nc;
}
EXPORT_SYMBOL(ldc_map_sg);

int ldc_map_single(struct ldc_channel *lp,
		   void *buf, unsigned int len,
		   struct ldc_trans_cookie *cookies, int ncookies,
		   unsigned int map_perm)
{
	unsigned long npages, pa, flags;
	struct ldc_mtable_entry *base;
	struct cookie_state state;
	struct ldc_iommu *iommu;

	if ((map_perm & ~LDC_MAP_ALL) || (ncookies < 1))
		return -EINVAL;

	pa = __pa(buf);
	if ((pa | len) & (8UL - 1))
		return -EFAULT;

	npages = pages_in_region(pa, len);

	iommu = &lp->iommu;

	spin_lock_irqsave(&iommu->lock, flags);
	base = alloc_npages(iommu, npages);
	spin_unlock_irqrestore(&iommu->lock, flags);

	if (!base)
		return -ENOMEM;

	state.page_table = iommu->page_table;
	state.cookies = cookies;
	state.mte_base = perm_to_mte(map_perm);
	state.prev_cookie = ~(u64)0;
	state.pte_idx = (base - iommu->page_table);
	state.nc = 0;
	fill_cookies(&state, (pa & PAGE_MASK), (pa & ~PAGE_MASK), len);
	BUG_ON(state.nc != 1);

	return state.nc;
}
EXPORT_SYMBOL(ldc_map_single);

static void free_npages(unsigned long id, struct ldc_iommu *iommu,
			u64 cookie, u64 size)
{
	struct iommu_arena *arena = &iommu->arena;
	unsigned long i, shift, index, npages;
	struct ldc_mtable_entry *base;

	npages = PAGE_ALIGN(((cookie & ~PAGE_MASK) + size)) >> PAGE_SHIFT;
	index = cookie_to_index(cookie, &shift);
	base = iommu->page_table + index;

	BUG_ON(index > arena->limit ||
	       (index + npages) > arena->limit);

	for (i = 0; i < npages; i++) {
		if (base->cookie)
			sun4v_ldc_revoke(id, cookie + (i << shift),
					 base->cookie);
		base->mte = 0;
		__clear_bit(index + i, arena->map);
	}
}

void ldc_unmap(struct ldc_channel *lp, struct ldc_trans_cookie *cookies,
	       int ncookies)
{
	struct ldc_iommu *iommu = &lp->iommu;
	unsigned long flags;
	int i;

	spin_lock_irqsave(&iommu->lock, flags);
	for (i = 0; i < ncookies; i++) {
		u64 addr = cookies[i].cookie_addr;
		u64 size = cookies[i].cookie_size;

		free_npages(lp->id, iommu, addr, size);
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}
EXPORT_SYMBOL(ldc_unmap);

int ldc_copy(struct ldc_channel *lp, int copy_dir,
	     void *buf, unsigned int len, unsigned long offset,
	     struct ldc_trans_cookie *cookies, int ncookies)
{
	unsigned int orig_len;
	unsigned long ra;
	int i;

	if (copy_dir != LDC_COPY_IN && copy_dir != LDC_COPY_OUT) {
		printk(KERN_ERR PFX "ldc_copy: ID[%lu] Bad copy_dir[%d]\n",
		       lp->id, copy_dir);
		return -EINVAL;
	}

	ra = __pa(buf);
	if ((ra | len | offset) & (8UL - 1)) {
		printk(KERN_ERR PFX "ldc_copy: ID[%lu] Unaligned buffer "
		       "ra[%lx] len[%x] offset[%lx]\n",
		       lp->id, ra, len, offset);
		return -EFAULT;
	}

	if (lp->hs_state != LDC_HS_COMPLETE ||
	    (lp->flags & LDC_FLAG_RESET)) {
		printk(KERN_ERR PFX "ldc_copy: ID[%lu] Link down hs_state[%x] "
		       "flags[%x]\n", lp->id, lp->hs_state, lp->flags);
		return -ECONNRESET;
	}

	orig_len = len;
	for (i = 0; i < ncookies; i++) {
		unsigned long cookie_raddr = cookies[i].cookie_addr;
		unsigned long this_len = cookies[i].cookie_size;
		unsigned long actual_len;

		if (unlikely(offset)) {
			unsigned long this_off = offset;

			if (this_off > this_len)
				this_off = this_len;

			offset -= this_off;
			this_len -= this_off;
			if (!this_len)
				continue;
			cookie_raddr += this_off;
		}

		if (this_len > len)
			this_len = len;

		while (1) {
			unsigned long hv_err;

			hv_err = sun4v_ldc_copy(lp->id, copy_dir,
						cookie_raddr, ra,
						this_len, &actual_len);
			if (unlikely(hv_err)) {
				printk(KERN_ERR PFX "ldc_copy: ID[%lu] "
				       "HV error %lu\n",
				       lp->id, hv_err);
				if (lp->hs_state != LDC_HS_COMPLETE ||
				    (lp->flags & LDC_FLAG_RESET))
					return -ECONNRESET;
				else
					return -EFAULT;
			}

			cookie_raddr += actual_len;
			ra += actual_len;
			len -= actual_len;
			if (actual_len == this_len)
				break;

			this_len -= actual_len;
		}

		if (!len)
			break;
	}

	/* It is caller policy what to do about short copies.
	 * For example, a networking driver can declare the
	 * packet a runt and drop it.
	 */

	return orig_len - len;
}
EXPORT_SYMBOL(ldc_copy);

void *ldc_alloc_exp_dring(struct ldc_channel *lp, unsigned int len,
			  struct ldc_trans_cookie *cookies, int *ncookies,
			  unsigned int map_perm)
{
	void *buf;
	int err;

	if (len & (8UL - 1))
		return ERR_PTR(-EINVAL);

	buf = kzalloc(len, GFP_KERNEL);
	if (!buf)
		return ERR_PTR(-ENOMEM);

	err = ldc_map_single(lp, buf, len, cookies, *ncookies, map_perm);
	if (err < 0) {
		kfree(buf);
		return ERR_PTR(err);
	}
	*ncookies = err;

	return buf;
}
EXPORT_SYMBOL(ldc_alloc_exp_dring);

void ldc_free_exp_dring(struct ldc_channel *lp, void *buf, unsigned int len,
			struct ldc_trans_cookie *cookies, int ncookies)
{
	ldc_unmap(lp, cookies, ncookies);
	kfree(buf);
}
EXPORT_SYMBOL(ldc_free_exp_dring);

static int __init ldc_init(void)
{
	unsigned long major, minor;
	struct mdesc_handle *hp;
	const u64 *v;
	int err;
	u64 mp;

	hp = mdesc_grab();
	if (!hp)
		return -ENODEV;

	mp = mdesc_node_by_name(hp, MDESC_NODE_NULL, "platform");
	err = -ENODEV;
	if (mp == MDESC_NODE_NULL)
		goto out;

	v = mdesc_get_property(hp, mp, "domaining-enabled", NULL);
	if (!v)
		goto out;

	major = 1;
	minor = 0;
	if (sun4v_hvapi_register(HV_GRP_LDOM, major, &minor)) {
		printk(KERN_INFO PFX "Could not register LDOM hvapi.\n");
		goto out;
	}

	printk(KERN_INFO "%s", version);

	if (!*v) {
		printk(KERN_INFO PFX "Domaining disabled.\n");
		goto out;
	}
	ldom_domaining_enabled = 1;
	err = 0;

out:
	mdesc_release(hp);
	return err;
}

core_initcall(ldc_init);