drivers/gpu/nvgpu/gk20a/css_gr_gk20a.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759

/*
 * GK20A Cycle stats snapshots support (subsystem for gr_gk20a).
 *
 * Copyright (c) 2015, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/bitops.h>
#include <linux/dma-mapping.h>
#include <linux/dma-buf.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>

#include "gk20a.h"
#include "hw_perf_gk20a.h"
#include "hw_mc_gk20a.h"


/* cycle stats fifo header (must match NvSnapshotBufferFifo) */
struct gk20a_cs_snapshot_fifo {
	/* layout description of the buffer */
	u32	start;
	u32	end;

	/* snafu bits */
	u32	hw_overflow_events_occured;
	u32	sw_overflow_events_occured;

	/* the kernel copies new entries to put and
	 * increment the put++. if put == get then
	 * overflowEventsOccured++
	 */
	u32	put;
	u32	_reserved10;
	u32	_reserved11;
	u32	_reserved12;

	/* the driver/client reads from get until
	 * put==get, get++ */
	u32	get;
	u32	_reserved20;
	u32	_reserved21;
	u32	_reserved22;

	/* unused */
	u32	_reserved30;
	u32	_reserved31;
	u32	_reserved32;
	u32	_reserved33;
};

/* cycle stats fifo entry (must match NvSnapshotBufferFifoEntry) */
struct gk20a_cs_snapshot_fifo_entry {
	/* global 48 timestamp */
	u32	timestamp31_00:32;
	u32	timestamp39_32:8;

	/* id of perfmon, should correlate with CSS_MAX_PERFMON_IDS */
	u32	perfmon_id:8;

	/* typically samples_counter is wired to #pmtrigger count */
	u32	samples_counter:12;

	/* DS=Delay Sample, SZ=Size (0=32B, 1=16B) */
	u32	ds:1;
	u32	sz:1;
	u32	zero0:1;
	u32	zero1:1;

	/* counter results */
	u32	event_cnt:32;
	u32	trigger0_cnt:32;
	u32	trigger1_cnt:32;
	u32	sample_cnt:32;

	/* Local PmTrigger results for Maxwell+ or padding otherwise */
	u16	local_trigger_b_count:16;
	u16	book_mark_b:16;
	u16	local_trigger_a_count:16;
	u16	book_mark_a:16;
};


/* cycle stats snapshot client data (e.g. associated with channel) */
struct gk20a_cs_snapshot_client {
	struct list_head	list;
	u32			dmabuf_fd;
	struct dma_buf		*dma_handler;
	struct gk20a_cs_snapshot_fifo	*snapshot;
	u32			snapshot_size;
	u32			perfmon_start;
	u32			perfmon_count;
};

/* check client for pointed perfmon ownership */
#define CONTAINS_PERFMON(cl, pm)				\
		((cl)->perfmon_start <= (pm) &&			\
		((pm) - (cl)->perfmon_start) < (cl)->perfmon_count)

/* the minimal size of HW buffer - should be enough to avoid HW overflows */
#define CSS_MIN_HW_SNAPSHOT_SIZE	(8 * 1024 * 1024)

/* the minimal size of client buffer */
#define CSS_MIN_CLIENT_SNAPSHOT_SIZE				\
		(sizeof(struct gk20a_cs_snapshot_fifo) +	\
		sizeof(struct gk20a_cs_snapshot_fifo_entry) * 256)

/* address of fifo entry by offset */
#define CSS_FIFO_ENTRY(fifo, offs)				\
	((struct gk20a_cs_snapshot_fifo_entry *)(((char *)(fifo)) + (offs)))

/* calculate area capacity in number of fifo entries */
#define CSS_FIFO_ENTRY_CAPACITY(s)				\
	(((s) - sizeof(struct gk20a_cs_snapshot_fifo))		\
		/ sizeof(struct gk20a_cs_snapshot_fifo_entry))

/* reserved to indicate failures with data */
#define CSS_FIRST_PERFMON_ID	32
/* should correlate with size of gk20a_cs_snapshot_fifo_entry::perfmon_id */
#define CSS_MAX_PERFMON_IDS	256


/* this type is used for storing bits in perfmon mask */
typedef u32 css_perfmon_t;

/* local definitions to avoid hardcodes sizes and shifts */
#define PM_BITS		(sizeof(css_perfmon_t) * BITS_PER_BYTE)
#define PM_BITS_MASK	(PM_BITS - 1)

#define PM_BITMAP_SIZE	((CSS_MAX_PERFMON_IDS + PM_BITS - 1) / PM_BITS)

#define PM_SLOT(i)	((i) / PM_BITS)
#define PM_SHIFT(i)	((i) & PM_BITS_MASK)
#define PM_BIT(i)	(1u << PM_SHIFT(i))

#define CSS_PERFMON_GET(p, i)	(1 == ((p[PM_SLOT(i)] >> PM_SHIFT(i)) & 1))
#define CSS_PERFMON_USE(p, i)	(p[PM_SLOT(i)] |= PM_BIT(i))
#define CSS_PERFMON_REL(p, i)	(p[PM_SLOT(i)] &= ~PM_BIT(i))


/* cycle stats snapshot control structure for one HW entry and many clients */
struct gk20a_cs_snapshot {
	css_perfmon_t		perfmon_ids[PM_BITMAP_SIZE];
	struct list_head	clients;
	struct mem_desc		hw_memdesc;
	/* pointer to allocated cpu_va memory where GPU place data */
	struct gk20a_cs_snapshot_fifo_entry	*hw_snapshot;
	struct gk20a_cs_snapshot_fifo_entry	*hw_end;
	struct gk20a_cs_snapshot_fifo_entry	*hw_get;
};

/* reports whether the hw queue overflowed */
static inline bool css_hw_get_overflow_status(struct gk20a *g)
{
	const u32 st = perf_pmasys_control_membuf_status_overflowed_f();
	return st == (gk20a_readl(g, perf_pmasys_control_r()) & st);
}

/* returns how many pending snapshot entries are pending */
static inline u32 css_hw_get_pending_snapshots(struct gk20a *g)
{
	return gk20a_readl(g, perf_pmasys_mem_bytes_r()) /
			sizeof(struct gk20a_cs_snapshot_fifo_entry);
}

/* informs hw how many snapshots have been processed (frees up fifo space) */
static inline void css_hw_set_handled_snapshots(struct gk20a *g, u32 done)
{
	if (done > 0) {
		gk20a_writel(g, perf_pmasys_mem_bump_r(),
		     done * sizeof(struct gk20a_cs_snapshot_fifo_entry));
	}
}

/* disable streaming to memory */
static void css_hw_reset_streaming(struct gk20a *g)
{
	u32 engine_status;
	u32 old_pmc = gk20a_readl(g, mc_enable_r());

	/* reset the perfmon */
	gk20a_writel(g, mc_enable_r(),
				old_pmc & ~mc_enable_perfmon_enabled_f());
	gk20a_writel(g, mc_enable_r(), old_pmc);

	/* RBUFEMPTY must be set -- otherwise we'll pick up */
	/* snapshot that have been queued up from earlier   */
	engine_status = gk20a_readl(g, perf_pmasys_enginestatus_r());
	WARN_ON(0 == (engine_status
			& perf_pmasys_enginestatus_rbufempty_empty_f()));

	/* turn off writes */
	gk20a_writel(g, perf_pmasys_control_r(),
			perf_pmasys_control_membuf_clear_status_doit_f());

	/* pointing all pending snapshots as handled */
	css_hw_set_handled_snapshots(g, css_hw_get_pending_snapshots(g));
}

/*
 * WARNING: all css_gr_XXX functions are local and expected to be called
 * from locked context (protected by cs_lock)
 */

static int css_gr_create_shared_data(struct gr_gk20a *gr)
{
	struct gk20a_cs_snapshot *data;

	if (gr->cs_data)
		return 0;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	INIT_LIST_HEAD(&data->clients);
	gr->cs_data = data;

	return 0;
}

static int css_hw_enable_snapshot(struct gr_gk20a *gr, u32 snapshot_size)
{
	struct gk20a *g = gr->g;
	struct gk20a_cs_snapshot *data = gr->cs_data;
	int ret;

	u32 virt_addr_lo;
	u32 virt_addr_hi;
	u32 inst_pa_page;

	if (data->hw_snapshot)
		return 0;

	if (snapshot_size < CSS_MIN_HW_SNAPSHOT_SIZE)
		snapshot_size = CSS_MIN_HW_SNAPSHOT_SIZE;

	ret = gk20a_gmmu_alloc_map(&g->mm.pmu.vm, snapshot_size,
							&data->hw_memdesc);
	if (ret)
		return ret;

	/* perf output buffer may not cross a 4GB boundary - with a separate */
	/* va smaller than that, it won't but check anyway */
	if (!data->hw_memdesc.cpu_va ||
		data->hw_memdesc.size < snapshot_size ||
		data->hw_memdesc.gpu_va + u64_lo32(snapshot_size) > SZ_4G) {
		ret = -EFAULT;
		goto failed_allocation;
	}

	data->hw_snapshot =
		(struct gk20a_cs_snapshot_fifo_entry *)data->hw_memdesc.cpu_va;
	data->hw_end = data->hw_snapshot +
		snapshot_size / sizeof(struct gk20a_cs_snapshot_fifo_entry);
	data->hw_get = data->hw_snapshot;
	memset(data->hw_snapshot, 0xff, snapshot_size);

	/* address and size are aligned to 32 bytes, the lowest bits read back
	 * as zeros */
	virt_addr_lo = u64_lo32(data->hw_memdesc.gpu_va);
	virt_addr_hi = u64_hi32(data->hw_memdesc.gpu_va);

	css_hw_reset_streaming(g);

	gk20a_writel(g, perf_pmasys_outbase_r(), virt_addr_lo);
	gk20a_writel(g, perf_pmasys_outbaseupper_r(),
			perf_pmasys_outbaseupper_ptr_f(virt_addr_hi));
	gk20a_writel(g, perf_pmasys_outsize_r(), snapshot_size);

	/* this field is aligned to 4K */
	inst_pa_page = gk20a_mem_phys(&g->mm.hwpm.inst_block) >> 12;

	/* A write to MEM_BLOCK triggers the block bind operation. MEM_BLOCK
	 * should be written last */
	gk20a_writel(g, perf_pmasys_mem_block_r(),
			perf_pmasys_mem_block_base_f(inst_pa_page) |
			perf_pmasys_mem_block_valid_true_f() |
			perf_pmasys_mem_block_target_lfb_f());

	gk20a_dbg_info("cyclestats: buffer for hardware snapshots enabled\n");

	return 0;

failed_allocation:
	if (data->hw_memdesc.size) {
		gk20a_gmmu_unmap_free(&g->mm.pmu.vm, &data->hw_memdesc);
		memset(&data->hw_memdesc, 0, sizeof(data->hw_memdesc));
	}
	data->hw_snapshot = NULL;

	return ret;
}

static void css_hw_disable_snapshot(struct gr_gk20a *gr)
{
	struct gk20a *g = gr->g;
	struct gk20a_cs_snapshot *data = gr->cs_data;

	if (!data->hw_snapshot)
		return;

	css_hw_reset_streaming(g);

	gk20a_writel(g, perf_pmasys_outbase_r(), 0);
	gk20a_writel(g, perf_pmasys_outbaseupper_r(),
			perf_pmasys_outbaseupper_ptr_f(0));
	gk20a_writel(g, perf_pmasys_outsize_r(), 0);

	gk20a_writel(g, perf_pmasys_mem_block_r(),
			perf_pmasys_mem_block_base_f(0) |
			perf_pmasys_mem_block_valid_false_f() |
			perf_pmasys_mem_block_target_f(0));

	gk20a_gmmu_unmap_free(&g->mm.pmu.vm, &data->hw_memdesc);
	memset(&data->hw_memdesc, 0, sizeof(data->hw_memdesc));
	data->hw_snapshot = NULL;

	gk20a_dbg_info("cyclestats: buffer for hardware snapshots disabled\n");
}

static void css_gr_free_shared_data(struct gr_gk20a *gr)
{
	if (gr->cs_data) {
		/* the clients list is expected to be empty */
		css_hw_disable_snapshot(gr);

		/* release the objects */
		kfree(gr->cs_data);
		gr->cs_data = NULL;
	}
}


static struct gk20a_cs_snapshot_client*
css_gr_search_client(struct list_head *clients, u32 perfmon)
{
	struct list_head *pos;

	list_for_each(pos, clients) {
		struct gk20a_cs_snapshot_client *client =
			container_of(pos,
				struct gk20a_cs_snapshot_client, list);
		if (CONTAINS_PERFMON(client, perfmon))
			return client;
	}

	return NULL;
}

static int css_gr_flush_snapshots(struct gr_gk20a *gr)
{
	struct gk20a *g = gr->g;
	struct gk20a_cs_snapshot *css = gr->cs_data;
	struct gk20a_cs_snapshot_client *cur;
	u32 pending;

	/* variables for iterating over HW entries */
	u32 sid;
	struct gk20a_cs_snapshot_fifo_entry *src;

	/* due to data sharing with userspace we allowed update only */
	/* overflows and put field in the fifo header                */
	struct gk20a_cs_snapshot_fifo *dst;
	struct gk20a_cs_snapshot_fifo_entry *dst_get;
	struct gk20a_cs_snapshot_fifo_entry *dst_put;
	struct gk20a_cs_snapshot_fifo_entry *dst_head;
	struct gk20a_cs_snapshot_fifo_entry *dst_tail;

	if (!css)
		return -EINVAL;

	if (!css->hw_snapshot)
		return -EINVAL;

	if (list_empty(&css->clients))
		return -EBADF;

	/* check data available */
	pending = css_hw_get_pending_snapshots(g);
	if (!pending)
		return 0;

	if (css_hw_get_overflow_status(g)) {
		struct list_head *pos;

		list_for_each(pos, &css->clients) {
			cur = container_of(pos,
				struct gk20a_cs_snapshot_client, list);
			cur->snapshot->hw_overflow_events_occured++;
		}

		gk20a_warn(dev_from_gk20a(g),
			   "cyclestats: hardware overflow detected\n");
	}

	/* proceed all items in HW buffer */
	sid = 0;
	cur = NULL;
	dst = NULL;
	dst_put = NULL;
	src = css->hw_get;

	/* proceed all completed records */
	while (sid < pending && 0 == src->zero0) {
		/* we may have a new perfmon_id which required to */
		/* switch to a new client -> let's forget current */
		if (cur && !CONTAINS_PERFMON(cur, src->perfmon_id)) {
			dst->put = (char *)dst_put - (char *)dst;
			dst = NULL;
			cur = NULL;
		}

		/* now we have to select a new current client         */
		/* the client selection rate depends from experiment  */
		/* activity but on Android usually happened 1-2 times */
		if (!cur) {
			cur = css_gr_search_client(&css->clients,
							src->perfmon_id);
			if (cur) {
				/* found - setup all required data */
				dst = cur->snapshot;
				dst_get = CSS_FIFO_ENTRY(dst, dst->get);
				dst_put = CSS_FIFO_ENTRY(dst, dst->put);
				dst_head = CSS_FIFO_ENTRY(dst, dst->start);
				dst_tail = CSS_FIFO_ENTRY(dst, dst->end) - 1;
			} else {
				/* client not found - skipping this entry */
				gk20a_warn(dev_from_gk20a(g),
					   "cyclestats: orphaned perfmon %u\n",
							src->perfmon_id);
				goto next_hw_fifo_entry;
			}
		}

		/* check for software overflows */
		if (dst_put + 1 == dst_get ||
			(dst_put == dst_tail && dst_get == dst_head)) {
			/* no data copy, no pointer updates */
			dst->sw_overflow_events_occured++;
			gk20a_warn(dev_from_gk20a(g),
				   "cyclestats: perfmon %u soft overflow\n",
							src->perfmon_id);
		} else {
			*dst_put = *src;
			if (dst_put == dst_tail)
				dst_put = dst_head;
			else
				dst_put++;
		}

next_hw_fifo_entry:
		sid++;
		if (++src >= css->hw_end)
			src = css->hw_snapshot;
	}

	/* update client put pointer if necessary */
	if (cur && dst)
		dst->put = (char *)dst_put - (char *)dst;

	/* re-set HW buffer after processing taking wrapping into account */
	if (css->hw_get < src) {
		memset(css->hw_get, 0xff, (src - css->hw_get) * sizeof(*src));
	} else {
		memset(css->hw_snapshot, 0xff,
				(src - css->hw_snapshot) * sizeof(*src));
		memset(css->hw_get, 0xff,
				(css->hw_end - css->hw_get) * sizeof(*src));
	}
	gr->cs_data->hw_get = src;
	css_hw_set_handled_snapshots(g, sid);
	if (pending != sid) {
		/* not all entries proceed correctly. some of problems */
		/* reported as overflows, some as orphaned perfmons,   */
		/* but it will be better notify with summary about it  */
		gk20a_warn(dev_from_gk20a(g),
			   "cyclestats: done %u from %u entries\n",
							sid, pending);
	}

	return 0;
}

static u32 css_gr_allocate_perfmon_ids(struct gk20a_cs_snapshot *data,
				       u32 count)
{
	u32 *pids = data->perfmon_ids;
	u32  f;
	u32  e = CSS_MAX_PERFMON_IDS - count;

	if (!count || count > CSS_MAX_PERFMON_IDS - CSS_FIRST_PERFMON_ID)
		return 0;

	for (f = CSS_FIRST_PERFMON_ID; f < e; f++) {
		u32 slots = 0;
		u32 cur;
		u32 end = f + count;

		/* lookup for continuous hole [f, f+count) of unused bits */
		for (cur = f; cur < end; cur++) {
			if (CSS_PERFMON_GET(pids, cur))
				break;
			slots++;
		}

		if (count == slots) {
			/* we found of hole of unused bits with required */
			/* length -> can occupy it for our perfmon IDs   */
			for (cur = f; cur < end; cur++)
				CSS_PERFMON_USE(pids, cur);

			return f;
		}
	}

	return 0;
}

static u32 css_gr_release_perfmon_ids(struct gk20a_cs_snapshot *data,
				      u32 start,
				      u32 count)
{
	u32 *pids = data->perfmon_ids;
	u32  end = start + count;
	u32  cnt = 0;

	if (start >= CSS_FIRST_PERFMON_ID && end <= CSS_MAX_PERFMON_IDS) {
		u32  i;
		for (i = start; i < end; i++) {
			if (CSS_PERFMON_GET(pids, i)) {
				CSS_PERFMON_REL(pids, i);
				cnt++;
			}
		}
	}

	return cnt;
}


static int css_gr_free_client_data(struct gk20a_cs_snapshot *data,
				struct gk20a_cs_snapshot_client *client)
{
	int ret = 0;

	list_del(&client->list);
	if (client->perfmon_start && client->perfmon_count) {
		if (client->perfmon_count != css_gr_release_perfmon_ids(data,
				client->perfmon_start, client->perfmon_count))
			ret = -EINVAL;
	}
	if (client->dma_handler) {
		dma_buf_vunmap(client->dma_handler, client->snapshot);
		dma_buf_put(client->dma_handler);
	}

	kfree(client);

	return ret;
}

static int css_gr_create_client_data(struct gk20a_cs_snapshot *data,
			u32 dmabuf_fd, u32 perfmon_count,
			struct gk20a_cs_snapshot_client **client)
{
	struct gk20a_cs_snapshot_client *cur;
	int ret = 0;

	cur = kzalloc(sizeof(*cur), GFP_KERNEL);
	if (!cur) {
		ret = -ENOMEM;
		goto failed;
	}

	cur->dmabuf_fd   = dmabuf_fd;
	cur->dma_handler = dma_buf_get(cur->dmabuf_fd);
	if (IS_ERR(cur->dma_handler)) {
		ret = PTR_ERR(cur->dma_handler);
		cur->dma_handler = NULL;
		goto failed;
	}

	cur->snapshot = (struct gk20a_cs_snapshot_fifo *)
					dma_buf_vmap(cur->dma_handler);
	if (!cur->snapshot) {
		ret = -ENOMEM;
		goto failed;
	}

	cur->snapshot_size = cur->dma_handler->size;
	if (cur->snapshot_size < CSS_MIN_CLIENT_SNAPSHOT_SIZE) {
		ret = -ENOMEM;
		goto failed;
	}

	memset(cur->snapshot, 0, sizeof(*cur->snapshot));
	cur->snapshot->start = sizeof(*cur->snapshot);
	/* we should be ensure that can fit all fifo entries here */
	cur->snapshot->end =
		CSS_FIFO_ENTRY_CAPACITY(cur->snapshot_size)
			* sizeof(struct gk20a_cs_snapshot_fifo_entry)
			+ sizeof(struct gk20a_cs_snapshot_fifo);
	cur->snapshot->get = cur->snapshot->start;
	cur->snapshot->put = cur->snapshot->start;

	cur->perfmon_count = perfmon_count;
	if (cur->perfmon_count) {
		cur->perfmon_start = css_gr_allocate_perfmon_ids(data,
							cur->perfmon_count);
		if (!cur->perfmon_start) {
			ret = -ENOENT;
			goto failed;
		}
	}

	list_add_tail(&cur->list, &data->clients);
	*client = cur;

	return 0;

failed:
	*client = NULL;
	if (cur)
		css_gr_free_client_data(data, cur);

	return ret;
}


int gr_gk20a_css_attach(struct gk20a *g,
			u32 dmabuf_fd,
			u32 perfmon_count,
			u32 *perfmon_start,
			struct gk20a_cs_snapshot_client **cs_client)
{
	int ret = 0;
	struct gr_gk20a *gr;

	if (!g->allow_all)
		return -EACCES;
	/* we must have a placeholder to store pointer to client structure */
	if (!cs_client)
		return -EINVAL;

	gr = &g->gr;
	*cs_client = NULL;

	mutex_lock(&gr->cs_lock);

	ret = css_gr_create_shared_data(gr);
	if (ret)
		goto failed;

	ret = css_gr_create_client_data(gr->cs_data,
				     dmabuf_fd,
				     perfmon_count,
				     cs_client);
	if (ret)
		goto failed;

	ret = css_hw_enable_snapshot(gr, (*cs_client)->snapshot_size);
	if (ret)
		goto failed;

	if (perfmon_start)
		*perfmon_start = (*cs_client)->perfmon_start;

	mutex_unlock(&gr->cs_lock);

	return 0;

failed:
	if (gr->cs_data) {
		if (*cs_client) {
			css_gr_free_client_data(gr->cs_data, *cs_client);
			*cs_client = NULL;
		}

		if (list_empty(&gr->cs_data->clients))
			css_gr_free_shared_data(gr);
	}
	mutex_unlock(&gr->cs_lock);

	if (perfmon_start)
		*perfmon_start = 0;

	return ret;
}

int gr_gk20a_css_detach(struct gk20a *g,
				struct gk20a_cs_snapshot_client *cs_client)
{
	int ret = 0;
	struct gr_gk20a *gr;

	if (!g->allow_all)
		return -EACCES;

	if (!cs_client)
		return -EINVAL;

	gr = &g->gr;
	mutex_lock(&gr->cs_lock);
	if (gr->cs_data) {
		struct gk20a_cs_snapshot *data = gr->cs_data;

		ret = css_gr_free_client_data(data, cs_client);
		if (list_empty(&data->clients))
			css_gr_free_shared_data(gr);
	} else {
		ret = -EBADF;
	}
	mutex_unlock(&gr->cs_lock);

	return ret;
}

int gr_gk20a_css_flush(struct gk20a *g,
				struct gk20a_cs_snapshot_client *cs_client)
{
	int ret = 0;
	struct gr_gk20a *gr;

	if (!g->allow_all)
		return -EACCES;

	if (!cs_client)
		return -EINVAL;

	gr = &g->gr;
	mutex_lock(&gr->cs_lock);
	ret = css_gr_flush_snapshots(gr);
	mutex_unlock(&gr->cs_lock);

	return ret;
}

/* helper function with locking to cleanup snapshot code code in gr_gk20a.c */
void gr_gk20a_free_cyclestats_snapshot_data(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;

	mutex_lock(&gr->cs_lock);
	css_gr_free_shared_data(gr);
	mutex_unlock(&gr->cs_lock);
}