aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/mmc/mmci.c
blob: 9fef29d978b5e676fead18b320f86cbd19c533b4 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
/*
 *  linux/drivers/mmc/mmci.c - ARM PrimeCell MMCI PL180/1 driver
 *
 *  Copyright (C) 2003 Deep Blue Solutions, Ltd, All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/config.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/highmem.h>
#include <linux/mmc/host.h>
#include <linux/mmc/protocol.h>
#include <linux/amba/bus.h>
#include <linux/clk.h>

#include <asm/cacheflush.h>
#include <asm/div64.h>
#include <asm/io.h>
#include <asm/scatterlist.h>
#include <asm/sizes.h>
#include <asm/mach/mmc.h>

#include "mmci.h"

/*
 * Name used for the AMBA driver, for the requested memory regions and
 * as the prefix of the two interrupt labels.
 */
#define DRIVER_NAME "mmci-pl18x"

/*
 * DBG() - per-host debug trace.
 *
 * With CONFIG_MMC_DEBUG set this expands to pr_debug(), prefixing the
 * message with the MMC host name and the name of the calling function.
 * Otherwise it expands to an empty statement and the arguments are not
 * evaluated.
 */
#ifdef CONFIG_MMC_DEBUG
#define DBG(host,fmt,args...)	\
	pr_debug("%s: %s: " fmt, mmc_hostname(host->mmc), __func__ , args)
#else
#define DBG(host,fmt,args...)	do { } while (0)
#endif

/*
 * Upper bound, in Hz, on the bus clock advertised to the MMC core
 * (mmc->f_max is the smaller of this and the MCLK rate).  Exposed
 * read-only as the "fmax" module parameter.
 */
static unsigned int fmax = 515633;

static void
mmci_request_end(struct mmci_host *host, struct mmc_request *mrq)
{
	writel(0, host->base + MMCICOMMAND);

	host->mrq = NULL;
	host->cmd = NULL;

	if (mrq->data)
		mrq->data->bytes_xfered = host->data_xfered;

	/*
	 * Need to drop the host lock here; mmc_request_done may call
	 * back into the driver...
	 */
	spin_unlock(&host->lock);
	mmc_request_done(host->mmc, mrq);
	spin_lock(&host->lock);
}

static void mmci_stop_data(struct mmci_host *host)
{
	writel(0, host->base + MMCIDATACTRL);
	writel(0, host->base + MMCIMASK1);
	host->data = NULL;
}

static void mmci_start_data(struct mmci_host *host, struct mmc_data *data)
{
	unsigned int datactrl, timeout, irqmask;
	unsigned long long clks;
	void __iomem *base;

	DBG(host, "blksz %04x blks %04x flags %08x\n",
	    1 << data->blksz_bits, data->blocks, data->flags);

	host->data = data;
	host->size = data->blocks << data->blksz_bits;
	host->data_xfered = 0;

	mmci_init_sg(host, data);

	clks = (unsigned long long)data->timeout_ns * host->cclk;
	do_div(clks, 1000000000UL);

	timeout = data->timeout_clks + (unsigned int)clks;

	base = host->base;
	writel(timeout, base + MMCIDATATIMER);
	writel(host->size, base + MMCIDATALENGTH);

	datactrl = MCI_DPSM_ENABLE | data->blksz_bits << 4;
	if (data->flags & MMC_DATA_READ) {
		datactrl |= MCI_DPSM_DIRECTION;
		irqmask = MCI_RXFIFOHALFFULLMASK;

		/*
		 * If we have less than a FIFOSIZE of bytes to transfer,
		 * trigger a PIO interrupt as soon as any data is available.
		 */
		if (host->size < MCI_FIFOSIZE)
			irqmask |= MCI_RXDATAAVLBLMASK;
	} else {
		/*
		 * We don't actually need to include "FIFO empty" here
		 * since its implicit in "FIFO half empty".
		 */
		irqmask = MCI_TXFIFOHALFEMPTYMASK;
	}

	writel(datactrl, base + MMCIDATACTRL);
	writel(readl(base + MMCIMASK0) & ~MCI_DATAENDMASK, base + MMCIMASK0);
	writel(irqmask, base + MMCIMASK1);
}

/*
 * Issue @cmd to the card.
 *
 * @c carries any extra command-register bits the caller wants set; the
 * opcode, the enable bit and the response-type bits derived from
 * cmd->flags are OR-ed in here.  If the command state machine is still
 * enabled from a previous command it is disabled first, followed by a
 * 1us delay.
 */
static void
mmci_start_command(struct mmci_host *host, struct mmc_command *cmd, u32 c)
{
	void __iomem *regs = host->base;
	u32 rsp = 0;

	DBG(host, "op %02x arg %08x flags %08x\n",
	    cmd->opcode, cmd->arg, cmd->flags);

	if (readl(regs + MMCICOMMAND) & MCI_CPSM_ENABLE) {
		writel(0, regs + MMCICOMMAND);
		udelay(1);
	}

	/* a long (136-bit) response is a response with one more bit set */
	if (cmd->flags & MMC_RSP_PRESENT) {
		rsp = MCI_CPSM_RESPONSE;
		if (cmd->flags & MMC_RSP_136)
			rsp |= MCI_CPSM_LONGRSP;
	}

	c |= cmd->opcode | MCI_CPSM_ENABLE | rsp;

	/* placeholder: the condition is constant 0, so this never runs */
	if (/*interrupt*/0)
		c |= MCI_CPSM_INTERRUPT;

	host->cmd = cmd;

	writel(cmd->arg, regs + MMCIARGUMENT);
	writel(c, regs + MMCICOMMAND);
}

/*
 * Handle data-path status bits for the transfer described by @data.
 *
 * A block-end bit advances the transferred byte count by one block.
 * Any data error is recorded in data->error (CRC takes priority over
 * timeout, which takes priority over FIFO under/overrun) and is then
 * treated as the end of the transfer.  At end of data the data path is
 * stopped and either the stop command is issued or, when there is none,
 * the request is completed.
 */
static void
mmci_data_irq(struct mmci_host *host, struct mmc_data *data,
	      unsigned int status)
{
	const unsigned int fifo_errs = MCI_TXUNDERRUN | MCI_RXOVERRUN;
	const unsigned int data_errs =
		MCI_DATACRCFAIL | MCI_DATATIMEOUT | fifo_errs;

	if (status & MCI_DATABLOCKEND)
		host->data_xfered += 1 << data->blksz_bits;

	if (status & data_errs) {
		if (status & MCI_DATACRCFAIL)
			data->error = MMC_ERR_BADCRC;
		else if (status & MCI_DATATIMEOUT)
			data->error = MMC_ERR_TIMEOUT;
		else	/* only the FIFO error bits can be left */
			data->error = MMC_ERR_FIFO;

		/*
		 * The transfer was cut short: make sure whatever was
		 * already written into the current page is coherent.
		 */
		if (host->sg_len && (data->flags & MMC_DATA_READ))
			flush_dcache_page(host->sg_ptr->page);

		/* an error terminates the transfer */
		status |= MCI_DATAEND;
	}

	if (!(status & MCI_DATAEND))
		return;

	mmci_stop_data(host);

	if (data->stop)
		mmci_start_command(host, data->stop, 0);
	else
		mmci_request_end(host, data->mrq);
}

static void
mmci_cmd_irq(struct mmci_host *host, struct mmc_command *cmd,
	     unsigned int status)
{
	void __iomem *base = host->base;

	host->cmd = NULL;

	cmd->resp[0] = readl(base + MMCIRESPONSE0);
	cmd->resp[1] = readl(base + MMCIRESPONSE1);
	cmd->resp[2] = readl(base + MMCIRESPONSE2);
	cmd->resp[3] = readl(base + MMCIRESPONSE3);

	if (status & MCI_CMDTIMEOUT) {
		cmd->error = MMC_ERR_TIMEOUT;
	} else if (status & MCI_CMDCRCFAIL && cmd->flags & MMC_RSP_CRC) {
		cmd->error = MMC_ERR_BADCRC;
	}

	if (!cmd->data || cmd->error != MMC_ERR_NONE) {
		mmci_request_end(host, cmd->mrq);
	} else if (!(cmd->data->flags & MMC_DATA_READ)) {
		mmci_start_data(host, cmd->data);
	}
}

/*
 * Drain received data from the FIFO into @buffer by PIO.
 *
 * @host:   controller state; host->size is the number of bytes of the
 *          current transfer the PIO handler has not yet accounted for.
 * @buffer: destination, with room for at least @remain bytes.
 * @remain: bytes left in the current scatterlist segment.
 *
 * Returns the number of bytes by which the buffer position advanced.
 *
 * The amount of data sitting in the FIFO is computed as "bytes still
 * outstanding" minus MMCIFIFOCNT converted from words to bytes
 * (presumably MMCIFIFOCNT counts the words the data path has yet to
 * transfer - confirm against the PL180 TRM).  host->size is only
 * updated by the caller after this function returns, so bytes pulled
 * out of the FIFO during this call are tracked in the local
 * host_remain; using host->size directly would count them again on
 * every later loop iteration and read more than the FIFO holds.
 */
static int mmci_pio_read(struct mmci_host *host, char *buffer, unsigned int remain)
{
	void __iomem *base = host->base;
	char *ptr = buffer;
	u32 status;
	int host_remain = host->size;

	do {
		int count = host_remain - (readl(base + MMCIFIFOCNT) << 2);

		/*
		 * Compare as signed values: a negative count must fall
		 * through to the "nothing to read" exit below rather
		 * than be converted to a huge unsigned number.
		 */
		if (count > (int)remain)
			count = remain;

		if (count <= 0)
			break;

		readsl(base + MMCIFIFO, ptr, count >> 2);

		ptr += count;
		remain -= count;
		host_remain -= count;

		if (remain == 0)
			break;

		status = readl(base + MMCISTATUS);
	} while (status & MCI_RXDATAAVLBL);

	return ptr - buffer;
}

/*
 * Feed the transmit FIFO from @buffer by PIO.
 *
 * @host:   controller state.
 * @buffer: source data.
 * @remain: bytes left in the current scatterlist segment.
 * @status: status register value sampled by the caller.
 *
 * Each pass writes at most a full FIFO when the FIFO reports empty, or
 * half a FIFO otherwise, and keeps going while data remains and the
 * FIFO still reports half empty.  Returns the number of bytes by which
 * the buffer position advanced.
 */
static int mmci_pio_write(struct mmci_host *host, char *buffer, unsigned int remain, u32 status)
{
	void __iomem *regs = host->base;
	char *cursor = buffer;

	for (;;) {
		unsigned int room, chunk;

		if (status & MCI_TXFIFOEMPTY)
			room = MCI_FIFOSIZE;
		else
			room = MCI_FIFOHALFSIZE;

		chunk = min(remain, room);

		/* the FIFO is written in 32-bit words */
		writesl(regs + MMCIFIFO, cursor, chunk >> 2);

		cursor += chunk;
		remain -= chunk;
		if (!remain)
			break;

		status = readl(regs + MMCISTATUS);
		if (!(status & MCI_TXFIFOHALFEMPTY))
			break;
	}

	return cursor - buffer;
}

/*
 * PIO data transfer IRQ handler.
 *
 * Registered on the second interrupt line with the mmci_host as dev_id.
 * Moves data between the FIFO and the request's scatterlist, one
 * segment at a time, for as long as the FIFO can accept or supply data.
 * Adjusts the MASK1 interrupt selection near the end of a read, and
 * once the whole transfer has been moved masks the PIO interrupts and
 * unmasks the data-end interrupt in MASK0.
 *
 * Always returns IRQ_HANDLED.  Note that host->lock is not taken here.
 */
static irqreturn_t mmci_pio_irq(int irq, void *dev_id, struct pt_regs *regs)
{
	struct mmci_host *host = dev_id;
	void __iomem *base = host->base;
	u32 status;

	status = readl(base + MMCISTATUS);

	DBG(host, "irq1 %08x\n", status);

	do {
		unsigned long flags;
		unsigned int remain, len;
		char *buffer;

		/*
		 * For write, we only need to test the half-empty flag
		 * here - if the FIFO is completely empty, then by
		 * definition it is more than half empty.
		 *
		 * For read, check for data available.
		 */
		if (!(status & (MCI_TXFIFOHALFEMPTY|MCI_RXDATAAVLBL)))
			break;

		/*
		 * Map the current scatter buffer.
		 */
		buffer = mmci_kmap_atomic(host, &flags) + host->sg_off;
		remain = host->sg_ptr->length - host->sg_off;

		/* len stays 0 if neither direction is reported active */
		len = 0;
		if (status & MCI_RXACTIVE)
			len = mmci_pio_read(host, buffer, remain);
		if (status & MCI_TXACTIVE)
			len = mmci_pio_write(host, buffer, remain, status);

		/*
		 * Unmap the buffer.
		 */
		mmci_kunmap_atomic(host, buffer, &flags);

		/* account for what was moved in this segment and overall */
		host->sg_off += len;
		host->size -= len;
		remain -= len;

		/* segment not finished: the FIFO ran dry (or full) first */
		if (remain)
			break;

		/*
		 * If we were reading, and we have completed this
		 * page, ensure that the data cache is coherent.
		 */
		if (status & MCI_RXACTIVE)
			flush_dcache_page(host->sg_ptr->page);

		/* advance to the next segment; stop when there is none */
		if (!mmci_next_sg(host))
			break;

		status = readl(base + MMCISTATUS);
	} while (1);

	/*
	 * If we're nearing the end of the read, switch to
	 * "any data available" mode.
	 */
	if (status & MCI_RXACTIVE && host->size < MCI_FIFOSIZE)
		writel(MCI_RXDATAAVLBLMASK, base + MMCIMASK1);

	/*
	 * If we run out of data, disable the data IRQs; this
	 * prevents a race where the FIFO becomes empty before
	 * the chip itself has disabled the data path, and
	 * stops us racing with our data end IRQ.
	 */
	if (host->size == 0) {
		writel(0, base + MMCIMASK1);
		writel(readl(base + MMCIMASK0) | MCI_DATAENDMASK, base + MMCIMASK0);
	}

	return IRQ_HANDLED;
}

/*
 * Handle completion of command and data transfers.
 *
 * Registered on the first interrupt line with the mmci_host as dev_id.
 * Under host->lock, repeatedly reads the status register, keeps only
 * the bits enabled in MASK0, acknowledges them, and dispatches data
 * events to mmci_data_irq() and command events to mmci_cmd_irq() until
 * no enabled status bit remains.
 *
 * NOTE(review): ret is set to 1 on every pass, including a first pass
 * that finds no enabled status bit, so the handler always reports the
 * interrupt as handled even though the line is requested as shared.
 */
static irqreturn_t mmci_irq(int irq, void *dev_id, struct pt_regs *regs)
{
	struct mmci_host *host = dev_id;
	u32 status;
	int ret = 0;

	spin_lock(&host->lock);

	do {
		struct mmc_command *cmd;
		struct mmc_data *data;

		/* sample, restrict to enabled sources, then acknowledge */
		status = readl(host->base + MMCISTATUS);
		status &= readl(host->base + MMCIMASK0);
		writel(status, host->base + MMCICLEAR);

		DBG(host, "irq0 %08x\n", status);

		/* data events are only meaningful while a transfer is active */
		data = host->data;
		if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_TXUNDERRUN|
			      MCI_RXOVERRUN|MCI_DATAEND|MCI_DATABLOCKEND) && data)
			mmci_data_irq(host, data, status);

		/*
		 * host->cmd is re-read here because the data handler above
		 * may have started a stop command or ended the request.
		 */
		cmd = host->cmd;
		if (status & (MCI_CMDCRCFAIL|MCI_CMDTIMEOUT|MCI_CMDSENT|MCI_CMDRESPEND) && cmd)
			mmci_cmd_irq(host, cmd, status);

		ret = 1;
	} while (status);

	spin_unlock(&host->lock);

	return IRQ_RETVAL(ret);
}

static void mmci_request(struct mmc_host *mmc, struct mmc_request *mrq)
{
	struct mmci_host *host = mmc_priv(mmc);

	WARN_ON(host->mrq != NULL);

	spin_lock_irq(&host->lock);

	host->mrq = mrq;

	if (mrq->data && mrq->data->flags & MMC_DATA_READ)
		mmci_start_data(host, mrq->data);

	mmci_start_command(host, mrq->cmd, 0);

	spin_unlock_irq(&host->lock);
}

/*
 * mmc_host_ops.set_ios: apply the bus settings requested by the core.
 *
 * @mmc: host being configured.
 * @ios: requested clock (Hz, 0 = clock off), bus mode, power mode, Vdd.
 *
 * Clock: a request at or above MCLK selects bypass; otherwise a divider
 * is chosen so that the card clock is mclk / (2 * (divider + 1)).  The
 * resulting rate is remembered in host->cclk for the data timeout
 * calculation.  The clock register is always written.
 *
 * Power: the platform's optional translate_vdd() hook, the power mode
 * and open-drain bus mode are combined into one value, which is only
 * written to the power register when it differs from the cached one.
 */
static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
{
	struct mmci_host *host = mmc_priv(mmc);
	u32 clk = 0, pwr = 0;

	DBG(host, "clock %uHz busmode %u powermode %u Vdd %u\n",
	    ios->clock, ios->bus_mode, ios->power_mode, ios->vdd);

	if (ios->clock) {
		if (ios->clock >= host->mclk) {
			clk = MCI_CLK_BYPASS;
			host->cclk = host->mclk;
		} else {
			u32 div = host->mclk / (2 * ios->clock);

			/*
			 * A request above mclk/2 yields div == 0; use the
			 * smallest divider (mclk/2) instead of letting
			 * "div - 1" wrap around and end up at the slowest.
			 */
			clk = div ? div - 1 : 0;

			/*
			 * The divider must stay within 0..255; a value of
			 * 256 would spill into the bits above the divider
			 * field (presumably 8 bits wide - see the PL180
			 * TRM), so clamp it as well.
			 */
			if (clk >= 256)
				clk = 255;
			host->cclk = host->mclk / (2 * (clk + 1));
		}
		clk |= MCI_CLK_ENABLE;
	}

	/* translate_vdd is an optional platform hook */
	if (host->plat->translate_vdd)
		pwr |= host->plat->translate_vdd(mmc_dev(mmc), ios->vdd);

	switch (ios->power_mode) {
	case MMC_POWER_OFF:
		break;
	case MMC_POWER_UP:
		pwr |= MCI_PWR_UP;
		break;
	case MMC_POWER_ON:
		pwr |= MCI_PWR_ON;
		break;
	}

	if (ios->bus_mode == MMC_BUSMODE_OPENDRAIN)
		pwr |= MCI_ROD;

	writel(clk, host->base + MMCICLOCK);

	/* skip the register write when the power setting is unchanged */
	if (host->pwr != pwr) {
		host->pwr = pwr;
		writel(pwr, host->base + MMCIPOWER);
	}
}

/*
 * Host operations handed to the MMC core in mmci_probe(): request
 * submission and bus/power configuration.
 */
static struct mmc_host_ops mmci_ops = {
	.request	= mmci_request,
	.set_ios	= mmci_set_ios,
};

/*
 * Timer callback: poll the platform's status hook once a second.
 *
 * @data is the mmci_host pointer stored in the timer.  When the value
 * returned by the platform differs from the one seen on the previous
 * poll, the MMC core is asked to rescan the slot.  The timer then
 * re-arms itself for one second (HZ jiffies) later.
 */
static void mmci_check_status(unsigned long data)
{
	struct mmci_host *host = (struct mmci_host *)data;
	unsigned int now = host->plat->status(mmc_dev(host->mmc));

	if (now != host->oldstat)
		mmc_detect_change(host->mmc, 0);
	host->oldstat = now;

	mod_timer(&host->timer, jiffies + HZ);
}

/*
 * Bind the driver to an AMBA device.
 *
 * Acquires, in order: the device's memory regions, an mmc_host with an
 * embedded mmci_host, the "MCLK" clock (enabled), a 4K register
 * mapping and both interrupt lines.  It then registers the host with
 * the MMC core and starts the one-second status polling timer.
 *
 * Returns 0 on success or a negative errno; on failure everything
 * acquired so far is released in reverse order through the labels at
 * the end of the function.
 */
static int mmci_probe(struct amba_device *dev, void *id)
{
	struct mmc_platform_data *plat = dev->dev.platform_data;
	struct mmci_host *host;
	struct mmc_host *mmc;
	int ret;

	/* must have platform data */
	if (!plat) {
		ret = -EINVAL;
		goto out;
	}

	ret = amba_request_regions(dev, DRIVER_NAME);
	if (ret)
		goto out;

	mmc = mmc_alloc_host(sizeof(struct mmci_host), &dev->dev);
	if (!mmc) {
		ret = -ENOMEM;
		goto rel_regions;
	}

	host = mmc_priv(mmc);
	host->clk = clk_get(&dev->dev, "MCLK");
	if (IS_ERR(host->clk)) {
		ret = PTR_ERR(host->clk);
		host->clk = NULL;
		goto host_free;
	}

	ret = clk_enable(host->clk);
	if (ret)
		goto clk_free;

	host->plat = plat;
	host->mclk = clk_get_rate(host->clk);
	host->mmc = mmc;
	host->base = ioremap(dev->res.start, SZ_4K);
	if (!host->base) {
		ret = -ENOMEM;
		goto clk_disable;
	}

	/*
	 * Advertise the clock range: the minimum is MCLK / 512 rounded
	 * up, the maximum is MCLK capped by the fmax module parameter.
	 */
	mmc->ops = &mmci_ops;
	mmc->f_min = (host->mclk + 511) / 512;
	mmc->f_max = min(host->mclk, fmax);
	mmc->ocr_avail = plat->ocr_mask;

	/*
	 * We can do SGIO
	 */
	mmc->max_hw_segs = 16;
	mmc->max_phys_segs = NR_SG;

	/*
	 * Since we only have a 16-bit data length register, we must
	 * ensure that we don't exceed 2^16-1 bytes in a single request.
	 * Choose 64 (512-byte) sectors as the limit.
	 */
	mmc->max_sectors = 64;

	/*
	 * Set the maximum segment size.  Since we aren't doing DMA
	 * (yet) we are only limited by the data length register.
	 */
	mmc->max_seg_size = mmc->max_sectors << 9;

	spin_lock_init(&host->lock);

	/*
	 * Mask both interrupt groups and clear any stale status before
	 * the handlers are installed.
	 */
	writel(0, host->base + MMCIMASK0);
	writel(0, host->base + MMCIMASK1);
	writel(0xfff, host->base + MMCICLEAR);

	ret = request_irq(dev->irq[0], mmci_irq, SA_SHIRQ, DRIVER_NAME " (cmd)", host);
	if (ret)
		goto unmap;

	ret = request_irq(dev->irq[1], mmci_pio_irq, SA_SHIRQ, DRIVER_NAME " (pio)", host);
	if (ret)
		goto irq0_free;

	/* both handlers are in place: enable command/data interrupts */
	writel(MCI_IRQENABLE, host->base + MMCIMASK0);

	amba_set_drvdata(dev, mmc);

	mmc_add_host(mmc);

	printk(KERN_INFO "%s: MMCI rev %x cfg %02x at 0x%08lx irq %d,%d\n",
		mmc_hostname(mmc), amba_rev(dev), amba_config(dev),
		dev->res.start, dev->irq[0], dev->irq[1]);

	/*
	 * Start polling the platform status hook once a second.
	 * NOTE(review): mmci_check_status() calls plat->status without a
	 * NULL check - confirm every platform provides the hook.
	 */
	init_timer(&host->timer);
	host->timer.data = (unsigned long)host;
	host->timer.function = mmci_check_status;
	host->timer.expires = jiffies + HZ;
	add_timer(&host->timer);

	return 0;

	/* error unwind: each label releases one resource, newest first */
 irq0_free:
	free_irq(dev->irq[0], host);
 unmap:
	iounmap(host->base);
 clk_disable:
	clk_disable(host->clk);
 clk_free:
	clk_put(host->clk);
 host_free:
	mmc_free_host(mmc);
 rel_regions:
	amba_release_regions(dev);
 out:
	return ret;
}

/*
 * Unbind the driver from an AMBA device.
 *
 * Clears the driver data and, if a host had been registered, stops the
 * polling timer, removes the host from the MMC core, masks interrupts,
 * idles the command and data state machines and releases everything
 * mmci_probe() acquired.  Always returns 0.
 */
static int mmci_remove(struct amba_device *dev)
{
	struct mmc_host *mmc = amba_get_drvdata(dev);
	struct mmci_host *host;

	amba_set_drvdata(dev, NULL);

	/* nothing was bound to this device */
	if (!mmc)
		return 0;

	host = mmc_priv(mmc);

	/* stop the status poll before the host goes away */
	del_timer_sync(&host->timer);

	mmc_remove_host(mmc);

	/* quiesce the hardware: no interrupts, no command, no data */
	writel(0, host->base + MMCIMASK0);
	writel(0, host->base + MMCIMASK1);

	writel(0, host->base + MMCICOMMAND);
	writel(0, host->base + MMCIDATACTRL);

	free_irq(dev->irq[0], host);
	free_irq(dev->irq[1], host);

	iounmap(host->base);
	clk_disable(host->clk);
	clk_put(host->clk);

	mmc_free_host(mmc);

	amba_release_regions(dev);

	return 0;
}

#ifdef CONFIG_PM
/*
 * Suspend: let the MMC core suspend the host and, if that succeeds,
 * mask the command/data interrupts.  Returns the core's result, or 0
 * when no host is bound to the device.
 */
static int mmci_suspend(struct amba_device *dev, pm_message_t state)
{
	struct mmc_host *mmc = amba_get_drvdata(dev);
	struct mmci_host *host;
	int ret;

	if (!mmc)
		return 0;

	host = mmc_priv(mmc);

	ret = mmc_suspend_host(mmc, state);
	if (!ret)
		writel(0, host->base + MMCIMASK0);

	return ret;
}

/*
 * Resume: re-enable the command/data interrupts, then let the MMC core
 * resume the host.  Returns the core's result, or 0 when no host is
 * bound to the device.
 */
static int mmci_resume(struct amba_device *dev)
{
	struct mmc_host *mmc = amba_get_drvdata(dev);
	struct mmci_host *host;

	if (!mmc)
		return 0;

	host = mmc_priv(mmc);

	writel(MCI_IRQENABLE, host->base + MMCIMASK0);

	return mmc_resume_host(mmc);
}
#else
/* without power management the driver provides no suspend/resume hooks */
#define mmci_suspend	NULL
#define mmci_resume	NULL
#endif

/*
 * AMBA peripheral IDs this driver binds to.  Only the bits selected by
 * each entry's mask are compared.  The two entries presumably identify
 * the PL180 and PL181 respectively (the module description names both)
 * - confirm against the TRMs.  The all-zero entry terminates the table.
 */
static struct amba_id mmci_ids[] = {
	{
		.id	= 0x00041180,
		.mask	= 0x000fffff,
	},
	{
		.id	= 0x00041181,
		.mask	= 0x000fffff,
	},
	{ 0, 0 },
};

/*
 * AMBA bus driver description: ties the ID table to the probe/remove
 * entry points and to the suspend/resume hooks (which are NULL when
 * CONFIG_PM is not set).
 */
static struct amba_driver mmci_driver = {
	.drv		= {
		.name	= DRIVER_NAME,
	},
	.probe		= mmci_probe,
	.remove		= mmci_remove,
	.suspend	= mmci_suspend,
	.resume		= mmci_resume,
	.id_table	= mmci_ids,
};

/* Module entry point: register the driver with the AMBA bus. */
static int __init mmci_init(void)
{
	return amba_driver_register(&mmci_driver);
}

/* Module exit point: unregister the driver from the AMBA bus. */
static void __exit mmci_exit(void)
{
	amba_driver_unregister(&mmci_driver);
}

module_init(mmci_init);
module_exit(mmci_exit);
/* "fmax" is an unsigned int parameter, world-readable and not writable */
module_param(fmax, uint, 0444);

MODULE_DESCRIPTION("ARM PrimeCell PL180/181 Multimedia Card Interface driver");
MODULE_LICENSE("GPL");
l kwa">case SCHED_EVENT_RUN: burn_nsecs(atom->duration); break; case SCHED_EVENT_SLEEP: if (atom->wait_sem) ret = sem_wait(atom->wait_sem); BUG_ON(ret); break; case SCHED_EVENT_WAKEUP: if (atom->wait_sem) ret = sem_post(atom->wait_sem); BUG_ON(ret); break; default: BUG_ON(1); } } static u64 get_cpu_usage_nsec_parent(void) { struct rusage ru; u64 sum; int err; err = getrusage(RUSAGE_SELF, &ru); BUG_ON(err); sum = ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3; sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3; return sum; } static u64 get_cpu_usage_nsec_self(void) { char filename [] = "/proc/1234567890/sched"; unsigned long msecs, nsecs; char *line = NULL; u64 total = 0; size_t len = 0; ssize_t chars; FILE *file; int ret; sprintf(filename, "/proc/%d/sched", getpid()); file = fopen(filename, "r"); BUG_ON(!file); while ((chars = getline(&line, &len, file)) != -1) { ret = sscanf(line, "se.sum_exec_runtime : %ld.%06ld\n", &msecs, &nsecs); if (ret == 2) { total = msecs*1e6 + nsecs; break; } } if (line) free(line); fclose(file); return total; } static void *thread_func(void *ctx) { struct task_desc *this_task = ctx; u64 cpu_usage_0, cpu_usage_1; unsigned long i, ret; char comm2[22]; sprintf(comm2, ":%s", this_task->comm); prctl(PR_SET_NAME, comm2); again: ret = sem_post(&this_task->ready_for_work); BUG_ON(ret); ret = pthread_mutex_lock(&start_work_mutex); BUG_ON(ret); ret = pthread_mutex_unlock(&start_work_mutex); BUG_ON(ret); cpu_usage_0 = get_cpu_usage_nsec_self(); for (i = 0; i < this_task->nr_events; i++) { this_task->curr_event = i; process_sched_event(this_task, this_task->atoms[i]); } cpu_usage_1 = get_cpu_usage_nsec_self(); this_task->cpu_usage = cpu_usage_1 - cpu_usage_0; ret = sem_post(&this_task->work_done_sem); BUG_ON(ret); ret = pthread_mutex_lock(&work_done_wait_mutex); BUG_ON(ret); ret = pthread_mutex_unlock(&work_done_wait_mutex); BUG_ON(ret); goto again; } static void create_tasks(void) { struct task_desc *task; pthread_attr_t attr; unsigned long 
i; int err; err = pthread_attr_init(&attr); BUG_ON(err); err = pthread_attr_setstacksize(&attr, (size_t)(16*1024)); BUG_ON(err); err = pthread_mutex_lock(&start_work_mutex); BUG_ON(err); err = pthread_mutex_lock(&work_done_wait_mutex); BUG_ON(err); for (i = 0; i < nr_tasks; i++) { task = tasks[i]; sem_init(&task->sleep_sem, 0, 0); sem_init(&task->ready_for_work, 0, 0); sem_init(&task->work_done_sem, 0, 0); task->curr_event = 0; err = pthread_create(&task->thread, &attr, thread_func, task); BUG_ON(err); } } static void wait_for_tasks(void) { u64 cpu_usage_0, cpu_usage_1; struct task_desc *task; unsigned long i, ret; start_time = get_nsecs(); cpu_usage = 0; pthread_mutex_unlock(&work_done_wait_mutex); for (i = 0; i < nr_tasks; i++) { task = tasks[i]; ret = sem_wait(&task->ready_for_work); BUG_ON(ret); sem_init(&task->ready_for_work, 0, 0); } ret = pthread_mutex_lock(&work_done_wait_mutex); BUG_ON(ret); cpu_usage_0 = get_cpu_usage_nsec_parent(); pthread_mutex_unlock(&start_work_mutex); for (i = 0; i < nr_tasks; i++) { task = tasks[i]; ret = sem_wait(&task->work_done_sem); BUG_ON(ret); sem_init(&task->work_done_sem, 0, 0); cpu_usage += task->cpu_usage; task->cpu_usage = 0; } cpu_usage_1 = get_cpu_usage_nsec_parent(); if (!runavg_cpu_usage) runavg_cpu_usage = cpu_usage; runavg_cpu_usage = (runavg_cpu_usage*9 + cpu_usage)/10; parent_cpu_usage = cpu_usage_1 - cpu_usage_0; if (!runavg_parent_cpu_usage) runavg_parent_cpu_usage = parent_cpu_usage; runavg_parent_cpu_usage = (runavg_parent_cpu_usage*9 + parent_cpu_usage)/10; ret = pthread_mutex_lock(&start_work_mutex); BUG_ON(ret); for (i = 0; i < nr_tasks; i++) { task = tasks[i]; sem_init(&task->sleep_sem, 0, 0); task->curr_event = 0; } } static void run_one_test(void) { u64 T0, T1, delta, avg_delta, fluct, std_dev; T0 = get_nsecs(); wait_for_tasks(); T1 = get_nsecs(); delta = T1 - T0; sum_runtime += delta; nr_runs++; avg_delta = sum_runtime / nr_runs; if (delta < avg_delta) fluct = avg_delta - delta; else fluct = delta - 
avg_delta; sum_fluct += fluct; std_dev = sum_fluct / nr_runs / sqrt(nr_runs); if (!run_avg) run_avg = delta; run_avg = (run_avg*9 + delta)/10; printf("#%-3ld: %0.3f, ", nr_runs, (double)delta/1000000.0); printf("ravg: %0.2f, ", (double)run_avg/1e6); printf("cpu: %0.2f / %0.2f", (double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6); #if 0 /* * rusage statistics done by the parent, these are less * accurate than the sum_exec_runtime based statistics: */ printf(" [%0.2f / %0.2f]", (double)parent_cpu_usage/1e6, (double)runavg_parent_cpu_usage/1e6); #endif printf("\n"); if (nr_sleep_corrections) printf(" (%ld sleep corrections)\n", nr_sleep_corrections); nr_sleep_corrections = 0; } static void test_calibrations(void) { u64 T0, T1; T0 = get_nsecs(); burn_nsecs(1e6); T1 = get_nsecs(); printf("the run test took %Ld nsecs\n", T1-T0); T0 = get_nsecs(); sleep_nsecs(1e6); T1 = get_nsecs(); printf("the sleep test took %Ld nsecs\n", T1-T0); } static void __cmd_replay(void) { unsigned long i; calibrate_run_measurement_overhead(); calibrate_sleep_measurement_overhead(); test_calibrations(); read_events(); printf("nr_run_events: %ld\n", nr_run_events); printf("nr_sleep_events: %ld\n", nr_sleep_events); printf("nr_wakeup_events: %ld\n", nr_wakeup_events); if (targetless_wakeups) printf("target-less wakeups: %ld\n", targetless_wakeups); if (multitarget_wakeups) printf("multi-target wakeups: %ld\n", multitarget_wakeups); if (nr_run_events_optimized) printf("run atoms optimized: %ld\n", nr_run_events_optimized); print_task_traces(); add_cross_task_wakeups(); create_tasks(); printf("------------------------------------------------------------\n"); for (i = 0; i < replay_repeat; i++) run_one_test(); } static int process_comm_event(event_t *event, unsigned long offset, unsigned long head) { struct thread *thread; thread = threads__findnew(event->comm.pid, &threads, &last_match); dump_printf("%p [%p]: perf_event_comm: %s:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), 
event->comm.comm, event->comm.pid); if (thread == NULL || thread__set_comm(thread, event->comm.comm)) { dump_printf("problem processing perf_event_comm, skipping event.\n"); return -1; } total_comm++; return 0; } struct raw_event_sample { u32 size; char data[0]; }; #define FILL_FIELD(ptr, field, event, data) \ ptr.field = (typeof(ptr.field)) raw_field_value(event, #field, data) #define FILL_ARRAY(ptr, array, event, data) \ do { \ void *__array = raw_field_ptr(event, #array, data); \ memcpy(ptr.array, __array, sizeof(ptr.array)); \ } while(0) #define FILL_COMMON_FIELDS(ptr, event, data) \ do { \ FILL_FIELD(ptr, common_type, event, data); \ FILL_FIELD(ptr, common_flags, event, data); \ FILL_FIELD(ptr, common_preempt_count, event, data); \ FILL_FIELD(ptr, common_pid, event, data); \ FILL_FIELD(ptr, common_tgid, event, data); \ } while (0) struct trace_switch_event { u32 size; u16 common_type; u8 common_flags; u8 common_preempt_count; u32 common_pid; u32 common_tgid; char prev_comm[16]; u32 prev_pid; u32 prev_prio; u64 prev_state; char next_comm[16]; u32 next_pid; u32 next_prio; }; struct trace_runtime_event { u32 size; u16 common_type; u8 common_flags; u8 common_preempt_count; u32 common_pid; u32 common_tgid; char comm[16]; u32 pid; u64 runtime; u64 vruntime; }; struct trace_wakeup_event { u32 size; u16 common_type; u8 common_flags; u8 common_preempt_count; u32 common_pid; u32 common_tgid; char comm[16]; u32 pid; u32 prio; u32 success; u32 cpu; }; struct trace_fork_event { u32 size; u16 common_type; u8 common_flags; u8 common_preempt_count; u32 common_pid; u32 common_tgid; char parent_comm[16]; u32 parent_pid; char child_comm[16]; u32 child_pid; }; struct trace_sched_handler { void (*switch_event)(struct trace_switch_event *, struct event *, int cpu, u64 timestamp, struct thread *thread); void (*runtime_event)(struct trace_runtime_event *, struct event *, int cpu, u64 timestamp, struct thread *thread); void (*wakeup_event)(struct trace_wakeup_event *, struct event *, 
int cpu, u64 timestamp, struct thread *thread); void (*fork_event)(struct trace_fork_event *, struct event *, int cpu, u64 timestamp, struct thread *thread); }; static void replay_wakeup_event(struct trace_wakeup_event *wakeup_event, struct event *event, int cpu __used, u64 timestamp __used, struct thread *thread __used) { struct task_desc *waker, *wakee; if (verbose) { printf("sched_wakeup event %p\n", event); printf(" ... pid %d woke up %s/%d\n", wakeup_event->common_pid, wakeup_event->comm, wakeup_event->pid); } waker = register_pid(wakeup_event->common_pid, "<unknown>"); wakee = register_pid(wakeup_event->pid, wakeup_event->comm); add_sched_event_wakeup(waker, timestamp, wakee); } static u64 cpu_last_switched[MAX_CPUS]; static void replay_switch_event(struct trace_switch_event *switch_event, struct event *event, int cpu, u64 timestamp, struct thread *thread __used) { struct task_desc *prev, *next; u64 timestamp0; s64 delta; if (verbose) printf("sched_switch event %p\n", event); if (cpu >= MAX_CPUS || cpu < 0) return; timestamp0 = cpu_last_switched[cpu]; if (timestamp0) delta = timestamp - timestamp0; else delta = 0; if (delta < 0) die("hm, delta: %Ld < 0 ?\n", delta); if (verbose) { printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n", switch_event->prev_comm, switch_event->prev_pid, switch_event->next_comm, switch_event->next_pid, delta); } prev = register_pid(switch_event->prev_pid, switch_event->prev_comm); next = register_pid(switch_event->next_pid, switch_event->next_comm); cpu_last_switched[cpu] = timestamp; add_sched_event_run(prev, timestamp, delta); add_sched_event_sleep(prev, timestamp, switch_event->prev_state); } static void replay_fork_event(struct trace_fork_event *fork_event, struct event *event, int cpu __used, u64 timestamp __used, struct thread *thread __used) { if (verbose) { printf("sched_fork event %p\n", event); printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid); printf("... 
child: %s/%d\n", fork_event->child_comm, fork_event->child_pid); } register_pid(fork_event->parent_pid, fork_event->parent_comm); register_pid(fork_event->child_pid, fork_event->child_comm); } static struct trace_sched_handler replay_ops = { .wakeup_event = replay_wakeup_event, .switch_event = replay_switch_event, .fork_event = replay_fork_event, }; struct sort_dimension { const char *name; sort_fn_t cmp; struct list_head list; }; static LIST_HEAD(cmp_pid); static int thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r) { struct sort_dimension *sort; int ret = 0; BUG_ON(list_empty(list)); list_for_each_entry(sort, list, list) { ret = sort->cmp(l, r); if (ret) return ret; } return ret; } static struct work_atoms * thread_atoms_search(struct rb_root *root, struct thread *thread, struct list_head *sort_list) { struct rb_node *node = root->rb_node; struct work_atoms key = { .thread = thread }; while (node) { struct work_atoms *atoms; int cmp; atoms = container_of(node, struct work_atoms, node); cmp = thread_lat_cmp(sort_list, &key, atoms); if (cmp > 0) node = node->rb_left; else if (cmp < 0) node = node->rb_right; else { BUG_ON(thread != atoms->thread); return atoms; } } return NULL; } static void __thread_latency_insert(struct rb_root *root, struct work_atoms *data, struct list_head *sort_list) { struct rb_node **new = &(root->rb_node), *parent = NULL; while (*new) { struct work_atoms *this; int cmp; this = container_of(*new, struct work_atoms, node); parent = *new; cmp = thread_lat_cmp(sort_list, data, this); if (cmp > 0) new = &((*new)->rb_left); else new = &((*new)->rb_right); } rb_link_node(&data->node, parent, new); rb_insert_color(&data->node, root); } static void thread_atoms_insert(struct thread *thread) { struct work_atoms *atoms; atoms = calloc(sizeof(*atoms), 1); if (!atoms) die("No memory"); atoms->thread = thread; INIT_LIST_HEAD(&atoms->work_list); __thread_latency_insert(&atom_root, atoms, &cmp_pid); } static void 
latency_fork_event(struct trace_fork_event *fork_event __used, struct event *event __used, int cpu __used, u64 timestamp __used, struct thread *thread __used) { /* should insert the newcomer */ } __used static char sched_out_state(struct trace_switch_event *switch_event) { const char *str = TASK_STATE_TO_CHAR_STR; return str[switch_event->prev_state]; } static void add_sched_out_event(struct work_atoms *atoms, char run_state, u64 timestamp) { struct work_atom *atom; atom = calloc(sizeof(*atom), 1); if (!atom) die("Non memory"); atom->sched_out_time = timestamp; if (run_state == 'R') { atom->state = THREAD_WAIT_CPU; atom->wake_up_time = atom->sched_out_time; } list_add_tail(&atom->list, &atoms->work_list); } static void add_runtime_event(struct work_atoms *atoms, u64 delta, u64 timestamp __used) { struct work_atom *atom; BUG_ON(list_empty(&atoms->work_list)); atom = list_entry(atoms->work_list.prev, struct work_atom, list); atom->runtime += delta; atoms->total_runtime += delta; } static void add_sched_in_event(struct work_atoms *atoms, u64 timestamp) { struct work_atom *atom; u64 delta; if (list_empty(&atoms->work_list)) return; atom = list_entry(atoms->work_list.prev, struct work_atom, list); if (atom->state != THREAD_WAIT_CPU) return; if (timestamp < atom->wake_up_time) { atom->state = THREAD_IGNORE; return; } atom->state = THREAD_SCHED_IN; atom->sched_in_time = timestamp; delta = atom->sched_in_time - atom->wake_up_time; atoms->total_lat += delta; if (delta > atoms->max_lat) atoms->max_lat = delta; atoms->nb_atoms++; } static void latency_switch_event(struct trace_switch_event *switch_event, struct event *event __used, int cpu, u64 timestamp, struct thread *thread __used) { struct work_atoms *out_events, *in_events; struct thread *sched_out, *sched_in; u64 timestamp0; s64 delta; BUG_ON(cpu >= MAX_CPUS || cpu < 0); timestamp0 = cpu_last_switched[cpu]; cpu_last_switched[cpu] = timestamp; if (timestamp0) delta = timestamp - timestamp0; else delta = 0; if (delta < 0) 
die("hm, delta: %Ld < 0 ?\n", delta); sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match); sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match); out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid); if (!out_events) { thread_atoms_insert(sched_out); out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid); if (!out_events) die("out-event: Internal tree error"); } add_sched_out_event(out_events, sched_out_state(switch_event), timestamp); in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid); if (!in_events) { thread_atoms_insert(sched_in); in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid); if (!in_events) die("in-event: Internal tree error"); /* * Take came in we have not heard about yet, * add in an initial atom in runnable state: */ add_sched_out_event(in_events, 'R', timestamp); } add_sched_in_event(in_events, timestamp); } static void latency_runtime_event(struct trace_runtime_event *runtime_event, struct event *event __used, int cpu, u64 timestamp, struct thread *this_thread __used) { struct work_atoms *atoms; struct thread *thread; BUG_ON(cpu >= MAX_CPUS || cpu < 0); thread = threads__findnew(runtime_event->pid, &threads, &last_match); atoms = thread_atoms_search(&atom_root, thread, &cmp_pid); if (!atoms) { thread_atoms_insert(thread); atoms = thread_atoms_search(&atom_root, thread, &cmp_pid); if (!atoms) die("in-event: Internal tree error"); add_sched_out_event(atoms, 'R', timestamp); } add_runtime_event(atoms, runtime_event->runtime, timestamp); } static void latency_wakeup_event(struct trace_wakeup_event *wakeup_event, struct event *__event __used, int cpu __used, u64 timestamp, struct thread *thread __used) { struct work_atoms *atoms; struct work_atom *atom; struct thread *wakee; /* Note for later, it may be interesting to observe the failing cases */ if (!wakeup_event->success) return; wakee = threads__findnew(wakeup_event->pid, &threads, &last_match); atoms = 
thread_atoms_search(&atom_root, wakee, &cmp_pid); if (!atoms) { thread_atoms_insert(wakee); atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid); if (!atoms) die("wakeup-event: Internal tree error"); add_sched_out_event(atoms, 'S', timestamp); } BUG_ON(list_empty(&atoms->work_list)); atom = list_entry(atoms->work_list.prev, struct work_atom, list); if (atom->state != THREAD_SLEEPING) nr_state_machine_bugs++; nr_timestamps++; if (atom->sched_out_time > timestamp) { nr_unordered_timestamps++; return; } atom->state = THREAD_WAIT_CPU; atom->wake_up_time = timestamp; } static struct trace_sched_handler lat_ops = { .wakeup_event = latency_wakeup_event, .switch_event = latency_switch_event, .runtime_event = latency_runtime_event, .fork_event = latency_fork_event, }; static void output_lat_thread(struct work_atoms *work_list) { int i; int ret; u64 avg; if (!work_list->nb_atoms) return; /* * Ignore idle threads: */ if (!strcmp(work_list->thread->comm, "swapper")) return; all_runtime += work_list->total_runtime; all_count += work_list->nb_atoms; ret = printf(" %s:%d ", work_list->thread->comm, work_list->thread->pid); for (i = 0; i < 24 - ret; i++) printf(" "); avg = work_list->total_lat / work_list->nb_atoms; printf("|%11.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n", (double)work_list->total_runtime / 1e6, work_list->nb_atoms, (double)avg / 1e6, (double)work_list->max_lat / 1e6); } static int pid_cmp(struct work_atoms *l, struct work_atoms *r) { if (l->thread->pid < r->thread->pid) return -1; if (l->thread->pid > r->thread->pid) return 1; return 0; } static struct sort_dimension pid_sort_dimension = { .name = "pid", .cmp = pid_cmp, }; static int avg_cmp(struct work_atoms *l, struct work_atoms *r) { u64 avgl, avgr; if (!l->nb_atoms) return -1; if (!r->nb_atoms) return 1; avgl = l->total_lat / l->nb_atoms; avgr = r->total_lat / r->nb_atoms; if (avgl < avgr) return -1; if (avgl > avgr) return 1; return 0; } static struct sort_dimension avg_sort_dimension = { .name = 
"avg", .cmp = avg_cmp, }; static int max_cmp(struct work_atoms *l, struct work_atoms *r) { if (l->max_lat < r->max_lat) return -1; if (l->max_lat > r->max_lat) return 1; return 0; } static struct sort_dimension max_sort_dimension = { .name = "max", .cmp = max_cmp, }; static int switch_cmp(struct work_atoms *l, struct work_atoms *r) { if (l->nb_atoms < r->nb_atoms) return -1; if (l->nb_atoms > r->nb_atoms) return 1; return 0; } static struct sort_dimension switch_sort_dimension = { .name = "switch", .cmp = switch_cmp, }; static int runtime_cmp(struct work_atoms *l, struct work_atoms *r) { if (l->total_runtime < r->total_runtime) return -1; if (l->total_runtime > r->total_runtime) return 1; return 0; } static struct sort_dimension runtime_sort_dimension = { .name = "runtime", .cmp = runtime_cmp, }; static struct sort_dimension *available_sorts[] = { &pid_sort_dimension, &avg_sort_dimension, &max_sort_dimension, &switch_sort_dimension, &runtime_sort_dimension, }; #define NB_AVAILABLE_SORTS (int)(sizeof(available_sorts) / sizeof(struct sort_dimension *)) static LIST_HEAD(sort_list); static int sort_dimension__add(char *tok, struct list_head *list) { int i; for (i = 0; i < NB_AVAILABLE_SORTS; i++) { if (!strcmp(available_sorts[i]->name, tok)) { list_add_tail(&available_sorts[i]->list, list); return 0; } } return -1; } static void setup_sorting(void); static void sort_lat(void) { struct rb_node *node; for (;;) { struct work_atoms *data; node = rb_first(&atom_root); if (!node) break; rb_erase(node, &atom_root); data = rb_entry(node, struct work_atoms, node); __thread_latency_insert(&sorted_atom_root, data, &sort_list); } } static void __cmd_lat(void) { struct rb_node *next; setup_pager(); read_events(); sort_lat(); printf("\n -----------------------------------------------------------------------------------------\n"); printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms |\n"); printf(" 
-----------------------------------------------------------------------------------------\n"); next = rb_first(&sorted_atom_root); while (next) { struct work_atoms *work_list; work_list = rb_entry(next, struct work_atoms, node); output_lat_thread(work_list); next = rb_next(next); } printf(" -----------------------------------------------------------------------------------------\n"); printf(" TOTAL: |%11.3f ms |%9Ld |\n", (double)all_runtime/1e6, all_count); printf(" ---------------------------------------------------\n"); if (nr_unordered_timestamps && nr_timestamps) { printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n", (double)nr_unordered_timestamps/(double)nr_timestamps*100.0, nr_unordered_timestamps, nr_timestamps); } else { } if (nr_lost_events && nr_events) { printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n", (double)nr_lost_events/(double)nr_events*100.0, nr_lost_events, nr_events, nr_lost_chunks); } if (nr_state_machine_bugs && nr_timestamps) { printf(" INFO: %.3f%% state machine bugs (%ld out of %ld)", (double)nr_state_machine_bugs/(double)nr_timestamps*100.0, nr_state_machine_bugs, nr_timestamps); if (nr_lost_events) printf(" (due to lost events?)"); printf("\n"); } if (nr_context_switch_bugs && nr_timestamps) { printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)", (double)nr_context_switch_bugs/(double)nr_timestamps*100.0, nr_context_switch_bugs, nr_timestamps); if (nr_lost_events) printf(" (due to lost events?)"); printf("\n"); } printf("\n"); } static struct trace_sched_handler *trace_handler; static void process_sched_wakeup_event(struct raw_event_sample *raw, struct event *event, int cpu __used, u64 timestamp __used, struct thread *thread __used) { struct trace_wakeup_event wakeup_event; FILL_COMMON_FIELDS(wakeup_event, event, raw->data); FILL_ARRAY(wakeup_event, comm, event, raw->data); FILL_FIELD(wakeup_event, pid, event, raw->data); FILL_FIELD(wakeup_event, prio, event, raw->data); FILL_FIELD(wakeup_event, 
success, event, raw->data); FILL_FIELD(wakeup_event, cpu, event, raw->data); trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread); } /* * Track the current task - that way we can know whether there's any * weird events, such as a task being switched away that is not current. */ static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 }; static void process_sched_switch_event(struct raw_event_sample *raw, struct event *event, int cpu, u64 timestamp __used, struct thread *thread __used) { struct trace_switch_event switch_event; FILL_COMMON_FIELDS(switch_event, event, raw->data); FILL_ARRAY(switch_event, prev_comm, event, raw->data); FILL_FIELD(switch_event, prev_pid, event, raw->data); FILL_FIELD(switch_event, prev_prio, event, raw->data); FILL_FIELD(switch_event, prev_state, event, raw->data); FILL_ARRAY(switch_event, next_comm, event, raw->data); FILL_FIELD(switch_event, next_pid, event, raw->data); FILL_FIELD(switch_event, next_prio, event, raw->data); if (curr_pid[cpu] != (u32)-1) { /* * Are we trying to switch away a PID that is * not current? 
*/ if (curr_pid[cpu] != switch_event.prev_pid) nr_context_switch_bugs++; } curr_pid[cpu] = switch_event.next_pid; trace_handler->switch_event(&switch_event, event, cpu, timestamp, thread); } static void process_sched_runtime_event(struct raw_event_sample *raw, struct event *event, int cpu __used, u64 timestamp __used, struct thread *thread __used) { struct trace_runtime_event runtime_event; FILL_ARRAY(runtime_event, comm, event, raw->data); FILL_FIELD(runtime_event, pid, event, raw->data); FILL_FIELD(runtime_event, runtime, event, raw->data); FILL_FIELD(runtime_event, vruntime, event, raw->data); trace_handler->runtime_event(&runtime_event, event, cpu, timestamp, thread); } static void process_sched_fork_event(struct raw_event_sample *raw, struct event *event, int cpu __used, u64 timestamp __used, struct thread *thread __used) { struct trace_fork_event fork_event; FILL_COMMON_FIELDS(fork_event, event, raw->data); FILL_ARRAY(fork_event, parent_comm, event, raw->data); FILL_FIELD(fork_event, parent_pid, event, raw->data); FILL_ARRAY(fork_event, child_comm, event, raw->data); FILL_FIELD(fork_event, child_pid, event, raw->data); trace_handler->fork_event(&fork_event, event, cpu, timestamp, thread); } static void process_sched_exit_event(struct event *event, int cpu __used, u64 timestamp __used, struct thread *thread __used) { if (verbose) printf("sched_exit event %p\n", event); } static void process_raw_event(event_t *raw_event __used, void *more_data, int cpu, u64 timestamp, struct thread *thread) { struct raw_event_sample *raw = more_data; struct event *event; int type; type = trace_parse_common_type(raw->data); event = trace_find_event(type); if (!strcmp(event->name, "sched_switch")) process_sched_switch_event(raw, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_stat_runtime")) process_sched_runtime_event(raw, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_wakeup")) process_sched_wakeup_event(raw, event, cpu, timestamp, thread); if 
(!strcmp(event->name, "sched_wakeup_new")) process_sched_wakeup_event(raw, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_process_fork")) process_sched_fork_event(raw, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_process_exit")) process_sched_exit_event(event, cpu, timestamp, thread); } static int process_sample_event(event_t *event, unsigned long offset, unsigned long head) { char level; int show = 0; struct dso *dso = NULL; struct thread *thread; u64 ip = event->ip.ip; u64 timestamp = -1; u32 cpu = -1; u64 period = 1; void *more_data = event->ip.__more_data; int cpumode; thread = threads__findnew(event->ip.pid, &threads, &last_match); if (sample_type & PERF_SAMPLE_TIME) { timestamp = *(u64 *)more_data; more_data += sizeof(u64); } if (sample_type & PERF_SAMPLE_CPU) { cpu = *(u32 *)more_data; more_data += sizeof(u32); more_data += sizeof(u32); /* reserved */ } if (sample_type & PERF_SAMPLE_PERIOD) { period = *(u64 *)more_data; more_data += sizeof(u64); } dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->header.misc, event->ip.pid, event->ip.tid, (void *)(long)ip, (long long)period); dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); if (thread == NULL) { eprintf("problem processing %d event, skipping it.\n", event->header.type); return -1; } cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK; if (cpumode == PERF_EVENT_MISC_KERNEL) { show = SHOW_KERNEL; level = 'k'; dso = kernel_dso; dump_printf(" ...... dso: %s\n", dso->name); } else if (cpumode == PERF_EVENT_MISC_USER) { show = SHOW_USER; level = '.'; } else { show = SHOW_HV; level = 'H'; dso = hypervisor_dso; dump_printf(" ...... 
dso: [hypervisor]\n"); } if (sample_type & PERF_SAMPLE_RAW) process_raw_event(event, more_data, cpu, timestamp, thread); return 0; } static int process_event(event_t *event, unsigned long offset, unsigned long head) { trace_event(event); nr_events++; switch (event->header.type) { case PERF_EVENT_MMAP: return 0; case PERF_EVENT_LOST: nr_lost_chunks++; nr_lost_events += event->lost.lost; return 0; case PERF_EVENT_COMM: return process_comm_event(event, offset, head); case PERF_EVENT_EXIT ... PERF_EVENT_READ: return 0; case PERF_EVENT_SAMPLE: return process_sample_event(event, offset, head); case PERF_EVENT_MAX: default: return -1; } return 0; } static int read_events(void) { int ret, rc = EXIT_FAILURE; unsigned long offset = 0; unsigned long head = 0; struct stat perf_stat; event_t *event; uint32_t size; char *buf; trace_report(); register_idle_thread(&threads, &last_match); input = open(input_name, O_RDONLY); if (input < 0) { perror("failed to open file"); exit(-1); } ret = fstat(input, &perf_stat); if (ret < 0) { perror("failed to stat file"); exit(-1); } if (!perf_stat.st_size) { fprintf(stderr, "zero-sized file, nothing to do!\n"); exit(0); } header = perf_header__read(input); head = header->data_offset; sample_type = perf_header__sample_type(header); if (!(sample_type & PERF_SAMPLE_RAW)) die("No trace sample to read. 
Did you call perf record " "without -R?"); if (load_kernel() < 0) { perror("failed to load kernel symbols"); return EXIT_FAILURE; } remap: buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ, MAP_SHARED, input, offset); if (buf == MAP_FAILED) { perror("failed to mmap file"); exit(-1); } more: event = (event_t *)(buf + head); size = event->header.size; if (!size) size = 8; if (head + event->header.size >= page_size * mmap_window) { unsigned long shift = page_size * (head / page_size); int res; res = munmap(buf, page_size * mmap_window); assert(res == 0); offset += shift; head -= shift; goto remap; } size = event->header.size; if (!size || process_event(event, offset, head) < 0) { /* * assume we lost track of the stream, check alignment, and * increment a single u64 in the hope to catch on again 'soon'. */ if (unlikely(head & 7)) head &= ~7ULL; size = 8; } head += size; if (offset + head < (unsigned long)perf_stat.st_size) goto more; rc = EXIT_SUCCESS; close(input); return rc; } static const char * const sched_usage[] = { "perf sched [<options>] {record|latency|replay|trace}", NULL }; static const struct option sched_options[] = { OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), OPT_END() }; static const char * const latency_usage[] = { "perf sched latency [<options>]", NULL }; static const struct option latency_options[] = { OPT_STRING('s', "sort", &sort_order, "key[,key2...]", "sort by key(s): runtime, switch, avg, max"), OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), OPT_END() }; static const char * const replay_usage[] = { "perf sched replay [<options>]", NULL }; static const struct option replay_options[] = { OPT_INTEGER('r', "repeat", &replay_repeat, "repeat the workload replay N times (-1: infinite)"), OPT_BOOLEAN('v', "verbose", 
&verbose, "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), OPT_END() }; static void setup_sorting(void) { char *tmp, *tok, *str = strdup(sort_order); for (tok = strtok_r(str, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) { if (sort_dimension__add(tok, &sort_list) < 0) { error("Unknown --sort key: `%s'", tok); usage_with_options(latency_usage, latency_options); } } free(str); sort_dimension__add((char *)"pid", &cmp_pid); } static const char *record_args[] = { "record", "-a", "-R", "-M", "-f", "-m", "1024", "-c", "1", "-e", "sched:sched_switch:r", "-e", "sched:sched_stat_wait:r", "-e", "sched:sched_stat_sleep:r", "-e", "sched:sched_stat_iowait:r", "-e", "sched:sched_stat_runtime:r", "-e", "sched:sched_process_exit:r", "-e", "sched:sched_process_fork:r", "-e", "sched:sched_wakeup:r", "-e", "sched:sched_migrate_task:r", }; static int __cmd_record(int argc, const char **argv) { unsigned int rec_argc, i, j; const char **rec_argv; rec_argc = ARRAY_SIZE(record_args) + argc - 1; rec_argv = calloc(rec_argc + 1, sizeof(char *)); for (i = 0; i < ARRAY_SIZE(record_args); i++) rec_argv[i] = strdup(record_args[i]); for (j = 1; j < (unsigned int)argc; j++, i++) rec_argv[i] = argv[j]; BUG_ON(i != rec_argc); return cmd_record(i, rec_argv, NULL); } int cmd_sched(int argc, const char **argv, const char *prefix __used) { symbol__init(); page_size = getpagesize(); argc = parse_options(argc, argv, sched_options, sched_usage, PARSE_OPT_STOP_AT_NON_OPTION); if (!argc) usage_with_options(sched_usage, sched_options); if (!strncmp(argv[0], "rec", 3)) { return __cmd_record(argc, argv); } else if (!strncmp(argv[0], "lat", 3)) { trace_handler = &lat_ops; if (argc > 1) { argc = parse_options(argc, argv, latency_options, latency_usage, 0); if (argc) usage_with_options(latency_usage, latency_options); } setup_sorting(); __cmd_lat(); } else if (!strncmp(argv[0], "rep", 3)) { trace_handler = &replay_ops; if (argc) { argc 
= parse_options(argc, argv, replay_options, replay_usage, 0); if (argc) usage_with_options(replay_usage, replay_options); } __cmd_replay(); } else if (!strcmp(argv[0], "trace")) { /* * Aliased to 'perf trace' for now: */ return cmd_trace(argc, argv, prefix); } else { usage_with_options(sched_usage, sched_options); } return 0; }