/*
 * GK20A Channel Synchronization Abstraction
 *
 * Copyright (c) 2014-2017, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/gk20a.h>

#include <linux/version.h>

#include <nvgpu/semaphore.h>
#include <nvgpu/kmem.h>
#include <nvgpu/log.h>
#include <nvgpu/atomic.h>
#include <nvgpu/bug.h>
#include <nvgpu/list.h>

#include "channel_sync_gk20a.h"
#include "gk20a.h"
#include "fence_gk20a.h"
#include "sync_gk20a.h"
#include "mm_gk20a.h"

#ifdef CONFIG_SYNC
#include "../drivers/staging/android/sync.h"
#endif

#ifdef CONFIG_TEGRA_GK20A_NVHOST
#include <linux/nvhost.h>

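/*
 * Syncpoint-backed channel synchronization: wraps the generic sync ops with
 * the owning channel, the host1x device providing syncpoint services, and
 * the channel's host-managed syncpoint id.
 */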
struct gk20a_channel_syncpt {
	struct gk20a_channel_sync ops;
	struct channel_gk20a *c;
	struct platform_device *host1x_pdev;
	u32 id;
};

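/*
 * Write one 4-word syncpoint wait into the priv cmdbuf, "off" words past the
 * start of the entry: syncpoint_a carrying the threshold payload, followed
 * by syncpoint_b carrying the syncpoint id with switch_en and the wait op.
 */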
static void add_wait_cmd(struct gk20a *g, struct priv_cmd_entry *cmd, u32 off,
		u32 id, u32 thresh)
{
	off = cmd->off + off;
	/* syncpoint_a */
	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001C);
	/* payload */
	nvgpu_mem_wr32(g, cmd->mem, off++, thresh);
	/* syncpoint_b */
	nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001D);
	/* syncpt_id, switch_en, wait */
	nvgpu_mem_wr32(g, cmd->mem, off++, (id << 8) | 0x10);
}

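/*
 * Emit a wait on a raw syncpoint id/threshold pair. Invalid ids are elided
 * with a warning, and already-expired thresholds need no commands at all.
 */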
static int gk20a_channel_syncpt_wait_syncpt(struct gk20a_channel_sync *s,
		u32 id, u32 thresh, struct priv_cmd_entry *wait_cmd,
		struct gk20a_fence *fence)
{
	struct gk20a_channel_syncpt *sp =
		container_of(s, struct gk20a_channel_syncpt, ops);
	struct channel_gk20a *c = sp->c;
	int err = 0;

	if (!nvhost_syncpt_is_valid_pt_ext(sp->host1x_pdev, id)) {
		nvgpu_warn(c->g, "invalid wait id in gpfifo submit, elided");
		return 0;
	}

	if (nvhost_syncpt_is_expired_ext(sp->host1x_pdev, id, thresh))
		return 0;

	err = gk20a_channel_alloc_priv_cmdbuf(c, 4, wait_cmd);
	if (err) {
		nvgpu_err(c->g,
				"not enough priv cmd buffer space");
		return err;
	}

	add_wait_cmd(c->g, wait_cmd, 0, id, thresh);

	return 0;
}

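/*
 * Turn an android sync_fence fd into syncpoint waits: validate every sync_pt
 * in the fence, then emit one 4-word wait per point, zero-filling the slots
 * whose thresholds have already expired.
 */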
static int gk20a_channel_syncpt_wait_fd(struct gk20a_channel_sync *s, int fd,
		       struct priv_cmd_entry *wait_cmd,
		       struct gk20a_fence *fence)
{
#ifdef CONFIG_SYNC
	int i;
	int num_wait_cmds;
	struct sync_fence *sync_fence;
	struct sync_pt *pt;
	struct gk20a_channel_syncpt *sp =
		container_of(s, struct gk20a_channel_syncpt, ops);
	struct channel_gk20a *c = sp->c;
	u32 wait_id;
	int err = 0;

	sync_fence = nvhost_sync_fdget(fd);
	if (!sync_fence)
		return -EINVAL;

	/* validate syncpt ids */
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
	list_for_each_entry(pt, &sync_fence->pt_list_head, pt_list) {
#else
	for (i = 0; i < sync_fence->num_fences; i++) {
		pt = sync_pt_from_fence(sync_fence->cbs[i].sync_pt);
#endif
		wait_id = nvhost_sync_pt_id(pt);
		if (!wait_id || !nvhost_syncpt_is_valid_pt_ext(sp->host1x_pdev,
					wait_id)) {
			sync_fence_put(sync_fence);
			return -EINVAL;
		}
	}

	num_wait_cmds = nvhost_sync_num_pts(sync_fence);
	if (num_wait_cmds == 0) {
		sync_fence_put(sync_fence);
		return 0;
	}

	err = gk20a_channel_alloc_priv_cmdbuf(c, 4 * num_wait_cmds, wait_cmd);
	if (err) {
		nvgpu_err(c->g,
				"not enough priv cmd buffer space");
		sync_fence_put(sync_fence);
		return err;
	}

	i = 0;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
	list_for_each_entry(pt, &sync_fence->pt_list_head, pt_list) {
#else
	for (i = 0; i < sync_fence->num_fences; i++) {
		struct fence *f = sync_fence->cbs[i].sync_pt;
		struct sync_pt *pt = sync_pt_from_fence(f);
#endif
		u32 wait_id = nvhost_sync_pt_id(pt);
		u32 wait_value = nvhost_sync_pt_thresh(pt);

		if (nvhost_syncpt_is_expired_ext(sp->host1x_pdev,
				wait_id, wait_value)) {
			/* each wait_cmd is 4 u32s */
			nvgpu_memset(c->g, wait_cmd->mem,
					(wait_cmd->off + i * 4) * sizeof(u32),
					0, 4 * sizeof(u32));
		} else
			add_wait_cmd(c->g, wait_cmd, i * 4, wait_id,
					wait_value);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
		i++;
	}
#else
	}
#endif

	WARN_ON(i != num_wait_cmds);
	sync_fence_put(sync_fence);

	return 0;
#else
	return -ENODEV;
#endif
}

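/*
 * Completion notifier registered with host1x for the submit-complete
 * threshold: kick the channel update and drop the channel reference taken in
 * __gk20a_channel_syncpt_incr().
 */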
static void gk20a_channel_syncpt_update(void *priv, int nr_completed)
{
	struct channel_gk20a *ch = priv;

	gk20a_channel_update(ch);

	/* note: channel_get() is in __gk20a_channel_syncpt_incr() */
	gk20a_channel_put(ch);
}

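/*
 * Emit the job-completion increment: an optional WFI followed by two
 * syncpoint increments (WAR for hw bug 1491360), bump the syncpoint max to
 * match, optionally register a completion notifier for the new threshold and
 * build the post fence from it.
 */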
static int __gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
				       bool wfi_cmd,
				       bool register_irq,
				       struct priv_cmd_entry *incr_cmd,
				       struct gk20a_fence *fence,
				       bool need_sync_fence)
{
	u32 thresh;
	size_t incr_cmd_size;
	int off;
	int err;
	struct gk20a_channel_syncpt *sp =
		container_of(s, struct gk20a_channel_syncpt, ops);
	struct channel_gk20a *c = sp->c;

	incr_cmd_size = 6;
	if (wfi_cmd)
		incr_cmd_size += 2;

	err = gk20a_channel_alloc_priv_cmdbuf(c, incr_cmd_size, incr_cmd);
	if (err)
		return err;

	off = incr_cmd->off;

	/* WAR for hw bug 1491360: syncpt needs to be incremented twice */

	if (wfi_cmd) {
		/* wfi */
		nvgpu_mem_wr32(c->g, incr_cmd->mem, off++, 0x2001001E);
		/* handle, ignored */
		nvgpu_mem_wr32(c->g, incr_cmd->mem, off++, 0x00000000);
	}
	/* syncpoint_a */
	nvgpu_mem_wr32(c->g, incr_cmd->mem, off++, 0x2001001C);
	/* payload, ignored */
	nvgpu_mem_wr32(c->g, incr_cmd->mem, off++, 0);
	/* syncpoint_b */
	nvgpu_mem_wr32(c->g, incr_cmd->mem, off++, 0x2001001D);
	/* syncpt_id, incr */
	nvgpu_mem_wr32(c->g, incr_cmd->mem, off++, (sp->id << 8) | 0x1);
	/* syncpoint_b */
	nvgpu_mem_wr32(c->g, incr_cmd->mem, off++, 0x2001001D);
	/* syncpt_id, incr */
	nvgpu_mem_wr32(c->g, incr_cmd->mem, off++, (sp->id << 8) | 0x1);

	WARN_ON(off - incr_cmd->off != incr_cmd_size);

	thresh = nvhost_syncpt_incr_max_ext(sp->host1x_pdev, sp->id, 2);

	if (register_irq) {
		struct channel_gk20a *referenced = gk20a_channel_get(c);

		WARN_ON(!referenced);

		if (referenced) {
			/* note: channel_put() is in
			 * gk20a_channel_syncpt_update() */

			err = nvhost_intr_register_notifier(
				sp->host1x_pdev,
				sp->id, thresh,
				gk20a_channel_syncpt_update, c);
			if (err)
				gk20a_channel_put(referenced);

			/* Adding interrupt action should
			 * never fail. A proper error handling
			 * here would require us to decrement
			 * the syncpt max back to its original
			 * value. */
			WARN(err,
			     "failed to set submit complete interrupt");
		}
	}

	err = gk20a_fence_from_syncpt(fence, sp->host1x_pdev, sp->id, thresh,
					 wfi_cmd, need_sync_fence);
	if (err)
		goto clean_up_priv_cmd;

	return 0;

clean_up_priv_cmd:
	gk20a_free_priv_cmdbuf(c, incr_cmd);
	return err;
}

static int gk20a_channel_syncpt_incr_wfi(struct gk20a_channel_sync *s,
				  struct priv_cmd_entry *entry,
				  struct gk20a_fence *fence)
{
	return __gk20a_channel_syncpt_incr(s,
			true /* wfi */,
			false /* no irq handler */,
			entry, fence, true);
}

static int gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
			      struct priv_cmd_entry *entry,
			      struct gk20a_fence *fence,
			      bool need_sync_fence,
			      bool register_irq)
{
	/* Don't add a wfi cmd here since we're not returning
	 * a fence to user space. */
	return __gk20a_channel_syncpt_incr(s,
			false /* no wfi */,
			register_irq /* register irq */,
			entry, fence, need_sync_fence);
}

static int gk20a_channel_syncpt_incr_user(struct gk20a_channel_sync *s,
				   int wait_fence_fd,
				   struct priv_cmd_entry *entry,
				   struct gk20a_fence *fence,
				   bool wfi,
				   bool need_sync_fence,
				   bool register_irq)
{
	/* Need to do 'wfi + host incr' since we return the fence
	 * to user space. */
	return __gk20a_channel_syncpt_incr(s,
			wfi,
			register_irq /* register irq */,
			entry, fence, need_sync_fence);
}

static void gk20a_channel_syncpt_set_min_eq_max(struct gk20a_channel_sync *s)
{
	struct gk20a_channel_syncpt *sp =
		container_of(s, struct gk20a_channel_syncpt, ops);
	nvhost_syncpt_set_min_eq_max_ext(sp->host1x_pdev, sp->id);
}

static void gk20a_channel_syncpt_signal_timeline(
		struct gk20a_channel_sync *s)
{
	/* Nothing to do. */
}

static int gk20a_channel_syncpt_id(struct gk20a_channel_sync *s)
{
	struct gk20a_channel_syncpt *sp =
		container_of(s, struct gk20a_channel_syncpt, ops);
	return sp->id;
}

static void gk20a_channel_syncpt_destroy(struct gk20a_channel_sync *s)
{
	struct gk20a_channel_syncpt *sp =
		container_of(s, struct gk20a_channel_syncpt, ops);
	nvhost_syncpt_set_min_eq_max_ext(sp->host1x_pdev, sp->id);
	nvhost_syncpt_put_ref_ext(sp->host1x_pdev, sp->id);
	nvgpu_kfree(sp->c->g, sp);
}

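/*
 * Allocate a host-managed syncpoint named after the channel, reset it so
 * that min equals max, and populate the sync ops table.
 */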
static struct gk20a_channel_sync *
gk20a_channel_syncpt_create(struct channel_gk20a *c)
{
	struct gk20a_channel_syncpt *sp;
	char syncpt_name[32];

	sp = nvgpu_kzalloc(c->g, sizeof(*sp));
	if (!sp)
		return NULL;

	sp->c = c;
	sp->host1x_pdev = c->g->host1x_dev;

	snprintf(syncpt_name, sizeof(syncpt_name),
		"%s_%d", c->g->name, c->hw_chid);

	sp->id = nvhost_get_syncpt_host_managed(sp->host1x_pdev,
						c->hw_chid, syncpt_name);
	if (!sp->id) {
		nvgpu_kfree(c->g, sp);
		nvgpu_err(c->g, "failed to get free syncpt");
		return NULL;
	}

	nvhost_syncpt_set_min_eq_max_ext(sp->host1x_pdev, sp->id);

	atomic_set(&sp->ops.refcount, 0);
	sp->ops.wait_syncpt		= gk20a_channel_syncpt_wait_syncpt;
	sp->ops.wait_fd			= gk20a_channel_syncpt_wait_fd;
	sp->ops.incr			= gk20a_channel_syncpt_incr;
	sp->ops.incr_wfi		= gk20a_channel_syncpt_incr_wfi;
	sp->ops.incr_user		= gk20a_channel_syncpt_incr_user;
	sp->ops.set_min_eq_max		= gk20a_channel_syncpt_set_min_eq_max;
	sp->ops.signal_timeline		= gk20a_channel_syncpt_signal_timeline;
	sp->ops.syncpt_id		= gk20a_channel_syncpt_id;
	sp->ops.destroy			= gk20a_channel_syncpt_destroy;

	return &sp->ops;
}
#endif /* CONFIG_TEGRA_GK20A_NVHOST */

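/*
 * Semaphore-backed channel synchronization, used when the platform provides
 * no syncpoints.
 */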
struct gk20a_channel_semaphore {
	struct gk20a_channel_sync ops;
	struct channel_gk20a *c;

	/* A semaphore pool owned by this channel. */
	struct nvgpu_semaphore_pool *pool;

	/* A sync timeline that advances when gpu completes work. */
	struct sync_timeline *timeline;
};

#ifdef CONFIG_SYNC
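/*
 * Bookkeeping for an asynchronous wait on a foreign sync_fence: the waiter
 * callback releases "sema" once "fence" signals, and "entry" links the work
 * into the per-GPU pending_sema_waits list so it can be cancelled at
 * shutdown.
 */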
struct wait_fence_work {
	struct sync_fence_waiter waiter;
	struct sync_fence *fence;
	struct channel_gk20a *ch;
	struct nvgpu_semaphore *sema;
	struct gk20a *g;
	struct nvgpu_list_node entry;
};

static inline struct wait_fence_work *
wait_fence_work_from_entry(struct nvgpu_list_node *node)
{
	return (struct wait_fence_work *)
		((uintptr_t)node - offsetof(struct wait_fence_work, entry));
}

/*
 * Keep track of all the pending waits on semaphores that exist for a GPU. This
 * has to be done because the waits on fences backed by semaphores are
 * asynchronous so it's impossible to otherwise know when they will fire. During
 * driver cleanup this list can be checked and all existing waits can be
 * canceled.
 */
static void gk20a_add_pending_sema_wait(struct gk20a *g,
					struct wait_fence_work *work)
{
	nvgpu_raw_spinlock_acquire(&g->pending_sema_waits_lock);
	nvgpu_list_add(&work->entry, &g->pending_sema_waits);
	nvgpu_raw_spinlock_release(&g->pending_sema_waits_lock);
}

/*
 * Copy the list head from the pending wait list to the passed list and
 * then delete the entire pending list.
 */
static void gk20a_start_sema_wait_cancel(struct gk20a *g,
					 struct nvgpu_list_node *list)
{
	nvgpu_raw_spinlock_acquire(&g->pending_sema_waits_lock);
	nvgpu_list_replace_init(&g->pending_sema_waits, list);
	nvgpu_raw_spinlock_release(&g->pending_sema_waits_lock);
}

/*
 * During shutdown this should be called to make sure that any pending sema
 * waits are canceled. This is a fairly delicate and tricky bit of code. Here's
 * how it works.
 *
 * Every time a semaphore wait is initiated in SW the wait_fence_work struct is
 * added to the pending_sema_waits list. When the semaphore launcher code runs
 * it checks the pending_sema_waits list. If this list is non-empty that means
 * that the wait_fence_work struct must be present and can be removed.
 *
 * When the driver shuts down one of the steps is to cancel pending sema waits.
 * To do this the entire list of pending sema waits is removed (and stored in a
 * separate local list). So now, if the semaphore launcher code runs it will see
 * that the pending_sema_waits list is empty and knows that it no longer owns
 * the wait_fence_work struct.
 */
void gk20a_channel_cancel_pending_sema_waits(struct gk20a *g)
{
	struct wait_fence_work *work;
	struct nvgpu_list_node local_pending_sema_waits;

	gk20a_start_sema_wait_cancel(g, &local_pending_sema_waits);

	while (!nvgpu_list_empty(&local_pending_sema_waits)) {
		int ret;

		work = nvgpu_list_first_entry(&local_pending_sema_waits,
					wait_fence_work,
					entry);

		nvgpu_list_del(&work->entry);

		/*
		 * Only nvgpu_kfree() work if the cancel is successful.
		 * Otherwise it's in use by the
		 * gk20a_channel_semaphore_launcher() code.
		 */
		ret = sync_fence_cancel_async(work->fence, &work->waiter);
		if (ret == 0)
			nvgpu_kfree(g, work);
	}
}

static void gk20a_channel_semaphore_launcher(
		struct sync_fence *fence,
		struct sync_fence_waiter *waiter)
{
	int err;
	struct wait_fence_work *w =
		container_of(waiter, struct wait_fence_work, waiter);
	struct gk20a *g = w->g;

	/*
	 * This spinlock must protect a _very_ small critical section -
	 * otherwise it's possible that the deterministic submit path suffers.
	 */
	nvgpu_raw_spinlock_acquire(&g->pending_sema_waits_lock);
	if (!nvgpu_list_empty(&g->pending_sema_waits))
		nvgpu_list_del(&w->entry);
	nvgpu_raw_spinlock_release(&g->pending_sema_waits_lock);

	gk20a_dbg_info("waiting for pre fence %p '%s'",
			fence, fence->name);
	err = sync_fence_wait(fence, -1);
	if (err < 0)
		nvgpu_err(g, "error waiting pre-fence: %d\n", err);

	gk20a_dbg_info(
		  "wait completed (%d) for fence %p '%s', triggering gpu work",
		  err, fence, fence->name);
	sync_fence_put(fence);
	nvgpu_semaphore_release(w->sema);
	nvgpu_semaphore_put(w->sema);
	nvgpu_kfree(g, w);
}
#endif

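/*
 * Write a semaphore acquire or release method sequence into the priv cmdbuf:
 * semaphore_a/b carry the semaphore GPU VA, semaphore_c the payload, and
 * semaphore_d either acq_geq with switch_en (acquire, 8 words) or a release
 * followed by a non-stall interrupt (release, 10 words).
 */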
static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,
			 struct nvgpu_semaphore *s, struct priv_cmd_entry *cmd,
			 int cmd_size, bool acquire, bool wfi)
{
	int ch = c->hw_chid;
	u32 ob, off = cmd->off;
	u64 va;

	ob = off;

	/*
	 * RO for acquire (since we just need to read the mem) and RW for
	 * release since we will need to write back to the semaphore memory.
	 */
	va = acquire ? nvgpu_semaphore_gpu_ro_va(s) :
		       nvgpu_semaphore_gpu_rw_va(s);

	/*
	 * If the op is not an acquire (and therefore a release), increment
	 * the underlying sema's next_value.
	 */
	if (!acquire)
		nvgpu_semaphore_incr(s);

	/* semaphore_a */
	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010004);
	/* offset_upper */
	nvgpu_mem_wr32(g, cmd->mem, off++, (va >> 32) & 0xff);
	/* semaphore_b */
	nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010005);
	/* offset */
	nvgpu_mem_wr32(g, cmd->mem, off++, va & 0xffffffff);

	if (acquire) {
		/* semaphore_c */
		nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010006);
		/* payload */
		nvgpu_mem_wr32(g, cmd->mem, off++,
			       nvgpu_semaphore_get_value(s));
		/* semaphore_d */
		nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010007);
		/* operation: acq_geq, switch_en */
		nvgpu_mem_wr32(g, cmd->mem, off++, 0x4 | (0x1 << 12));
	} else {
		/* semaphore_c */
		nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010006);
		/* payload */
		nvgpu_mem_wr32(g, cmd->mem, off++,
			       nvgpu_semaphore_get_value(s));
		/* semaphore_d */
		nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010007);
		/* operation: release, wfi */
		nvgpu_mem_wr32(g, cmd->mem, off++,
				0x2 | ((wfi ? 0x0 : 0x1) << 20));
		/* non_stall_int */
		nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010008);
		/* ignored */
		nvgpu_mem_wr32(g, cmd->mem, off++, 0);
	}

	if (acquire)
		gpu_sema_verbose_dbg(g, "(A) c=%d ACQ_GE %-4u owner=%-3d"
				     "va=0x%llx cmd_mem=0x%llx b=0x%llx off=%u",
				     ch, nvgpu_semaphore_get_value(s),
				     s->hw_sema->ch->hw_chid, va, cmd->gva,
				     cmd->mem->gpu_va, ob);
	else
		gpu_sema_verbose_dbg(g, "(R) c=%d INCR %u (%u) va=0x%llx "
				     "cmd_mem=0x%llx b=0x%llx off=%u",
				     ch, nvgpu_semaphore_get_value(s),
				     nvgpu_semaphore_read(s), va, cmd->gva,
				     cmd->mem->gpu_va, ob);
}

static int gk20a_channel_semaphore_wait_syncpt(
		struct gk20a_channel_sync *s, u32 id,
		u32 thresh, struct priv_cmd_entry *entry,
		struct gk20a_fence *fence)
{
	struct gk20a_channel_semaphore *sema =
		container_of(s, struct gk20a_channel_semaphore, ops);
	struct gk20a *g = sema->c->g;
	nvgpu_err(g, "trying to use syncpoint synchronization");
	return -ENODEV;
}

#ifdef CONFIG_SYNC
/*
 * Attempt a fast path for waiting on a sync_fence. Basically if the passed
 * sync_fence is backed by a nvgpu_semaphore then there's no reason to go
 * through the rigmarole of setting up a separate semaphore which waits on an
 * interrupt from the GPU and then triggers a worker thread to execute a SW
 * based semaphore release. Instead just have the GPU wait on the same semaphore
 * that is going to be incremented by the GPU.
 *
 * This function returns 0 on success, -ENODEV when the fast path cannot be
 * taken because the fence is not backed by a GPU semaphore, or a negative
 * error if allocating priv cmdbuf space fails. On success *fp_sema is set to
 * the backing semaphore, or to NULL if that semaphore has already signaled.
 */
static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
					 struct sync_fence *fence,
					 struct priv_cmd_entry *wait_cmd,
					 struct nvgpu_semaphore **fp_sema)
{
	struct nvgpu_semaphore *sema;
	int err;

	if (!gk20a_is_sema_backed_sync_fence(fence))
		return -ENODEV;

	sema = gk20a_sync_fence_get_sema(fence);

	/*
	 * If there's no underlying sema then the fence has already signaled;
	 * leave *fp_sema NULL so the caller can use an empty fence.
	 */
	if (!sema) {
		*fp_sema = NULL;
		return 0;
	}

	err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
	if (err)
		return err;

	nvgpu_semaphore_get(sema);
	BUG_ON(!atomic_read(&sema->value));
	add_sema_cmd(c->g, c, sema, wait_cmd, 8, true, false);

	/*
	 * Make sure that gk20a_channel_semaphore_wait_fd() can create another
	 * fence with the underlying semaphore.
	 */
	*fp_sema = sema;

	return 0;
}
#endif

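/*
 * Wait on a sync_fence fd using semaphores. Fences already backed by nvgpu
 * semaphores take the fast path and are waited on directly by the GPU.
 * Otherwise a fresh semaphore is acquired by the GPU and released from the
 * CPU by gk20a_channel_semaphore_launcher() once the fence signals.
 */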
static int gk20a_channel_semaphore_wait_fd(
		struct gk20a_channel_sync *s, int fd,
		struct priv_cmd_entry *entry,
		struct gk20a_fence *fence)
{
	struct gk20a_channel_semaphore *sema =
		container_of(s, struct gk20a_channel_semaphore, ops);
	struct channel_gk20a *c = sema->c;
#ifdef CONFIG_SYNC
	struct nvgpu_semaphore *fp_sema;
	struct sync_fence *sync_fence;
	struct priv_cmd_entry *wait_cmd = entry;
	struct wait_fence_work *w = NULL;
	int err, ret, status;

	sync_fence = gk20a_sync_fence_fdget(fd);
	if (!sync_fence)
		return -EINVAL;

	ret = __semaphore_wait_fd_fast_path(c, sync_fence, wait_cmd, &fp_sema);
	if (ret == 0) {
		if (fp_sema) {
			err = gk20a_fence_from_semaphore(c->g, fence,
					sema->timeline,
					fp_sema,
					&c->semaphore_wq,
					false, false);
			if (err) {
				nvgpu_semaphore_put(fp_sema);
				goto clean_up_priv_cmd;
			}
		} else
			/*
			 * Init an empty fence. It will instantly return
			 * from gk20a_fence_wait().
			 */
			gk20a_init_fence(fence, NULL, NULL, false);

		sync_fence_put(sync_fence);
		goto skip_slow_path;
	}

	/* If the fence has signaled there is no reason to wait on it. */
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
	status = sync_fence->status;
#else
	status = atomic_read(&sync_fence->status);
#endif
	if (status == 0) {
		sync_fence_put(sync_fence);
		goto skip_slow_path;
	}

	err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
	if (err) {
		nvgpu_err(c->g,
				"not enough priv cmd buffer space");
		goto clean_up_sync_fence;
	}

	w = nvgpu_kzalloc(c->g, sizeof(*w));
	if (!w) {
		err = -ENOMEM;
		goto clean_up_priv_cmd;
	}

	sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher);
	w->fence = sync_fence;
	w->g = c->g;
	w->ch = c;
	w->sema = nvgpu_semaphore_alloc(c);
	if (!w->sema) {
		nvgpu_err(c->g, "ran out of semaphores");
		err = -ENOMEM;
		goto clean_up_worker;
	}

	/* worker takes one reference */
	nvgpu_semaphore_get(w->sema);
	nvgpu_semaphore_incr(w->sema);

	/* GPU unblocked when the semaphore value increments. */
	add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);

	/*
	 *  We need to create the fence before adding the waiter to ensure
	 *  that we properly clean up in the event the sync_fence has
	 *  already signaled
	 */
	err = gk20a_fence_from_semaphore(c->g, fence, sema->timeline, w->sema,
			&c->semaphore_wq, false, false);
	if (err)
		goto clean_up_sema;

	ret = sync_fence_wait_async(sync_fence, &w->waiter);
	gk20a_add_pending_sema_wait(c->g, w);

	/*
	 * If the sync_fence has already signaled then the above async_wait
	 * will never trigger. This causes the semaphore release op to never
	 * happen which, in turn, hangs the GPU. That's bad. So let's just
	 * do the nvgpu_semaphore_release() right now.
	 */
	if (ret == 1) {
		sync_fence_put(sync_fence);
		nvgpu_semaphore_release(w->sema);
		nvgpu_semaphore_put(w->sema);
	}

skip_slow_path:
	return 0;

clean_up_sema:
	/*
	 * Release the refs to the semaphore, including
	 * the one for the worker since it will never run.
	 */
	nvgpu_semaphore_put(w->sema);
	nvgpu_semaphore_put(w->sema);
clean_up_worker:
	nvgpu_kfree(c->g, w);
clean_up_priv_cmd:
	gk20a_free_priv_cmdbuf(c, entry);
clean_up_sync_fence:
	sync_fence_put(sync_fence);
	return err;
#else
	nvgpu_err(c->g,
		  "trying to use sync fds with CONFIG_SYNC disabled");
	return -ENODEV;
#endif
}

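/*
 * Allocate a fresh completion semaphore, emit its release (with or without
 * WFI) into the priv cmdbuf and wrap it in a gk20a_fence.
 */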
static int __gk20a_channel_semaphore_incr(
		struct gk20a_channel_sync *s, bool wfi_cmd,
		struct priv_cmd_entry *incr_cmd,
		struct gk20a_fence *fence,
		bool need_sync_fence)
{
	int incr_cmd_size;
	struct gk20a_channel_semaphore *sp =
		container_of(s, struct gk20a_channel_semaphore, ops);
	struct channel_gk20a *c = sp->c;
	struct nvgpu_semaphore *semaphore;
	int err = 0;

	semaphore = nvgpu_semaphore_alloc(c);
	if (!semaphore) {
		nvgpu_err(c->g,
				"ran out of semaphores");
		return -ENOMEM;
	}

	incr_cmd_size = 10;
	err = gk20a_channel_alloc_priv_cmdbuf(c, incr_cmd_size, incr_cmd);
	if (err) {
		nvgpu_err(c->g,
				"not enough priv cmd buffer space");
		goto clean_up_sema;
	}

	/* Release the completion semaphore. */
	add_sema_cmd(c->g, c, semaphore, incr_cmd, incr_cmd_size, false,
			wfi_cmd);

	err = gk20a_fence_from_semaphore(c->g, fence,
			sp->timeline, semaphore,
			&c->semaphore_wq,
			wfi_cmd,
			need_sync_fence);
	if (err)
		goto clean_up_sema;

	return 0;

clean_up_sema:
	nvgpu_semaphore_put(semaphore);
	return err;
}

static int gk20a_channel_semaphore_incr_wfi(
		struct gk20a_channel_sync *s,
		struct priv_cmd_entry *entry,
		struct gk20a_fence *fence)
{
	return __gk20a_channel_semaphore_incr(s,
			true /* wfi */,
			entry, fence, true);
}

static int gk20a_channel_semaphore_incr(
		struct gk20a_channel_sync *s,
		struct priv_cmd_entry *entry,
		struct gk20a_fence *fence,
		bool need_sync_fence,
		bool register_irq)
{
	/* Don't add a wfi cmd here since we're not returning
	 * a fence to user space. */
	return __gk20a_channel_semaphore_incr(s,
			false /* no wfi */,
			entry, fence, need_sync_fence);
}

static int gk20a_channel_semaphore_incr_user(
		struct gk20a_channel_sync *s,
		int wait_fence_fd,
		struct priv_cmd_entry *entry,
		struct gk20a_fence *fence,
		bool wfi,
		bool need_sync_fence,
		bool register_irq)
{
#ifdef CONFIG_SYNC
	int err;

	err = __gk20a_channel_semaphore_incr(s, wfi, entry, fence,
			need_sync_fence);
	if (err)
		return err;

	return 0;
#else
	struct gk20a_channel_semaphore *sema =
		container_of(s, struct gk20a_channel_semaphore, ops);
	nvgpu_err(sema->c->g,
		  "trying to use sync fds with CONFIG_SYNC disabled");
	return -ENODEV;
#endif
}

static void gk20a_channel_semaphore_set_min_eq_max(struct gk20a_channel_sync *s)
{
	/* Nothing to do. */
}

static void gk20a_channel_semaphore_signal_timeline(
		struct gk20a_channel_sync *s)
{
	struct gk20a_channel_semaphore *sp =
		container_of(s, struct gk20a_channel_semaphore, ops);
	gk20a_sync_timeline_signal(sp->timeline);
}

static int gk20a_channel_semaphore_syncpt_id(struct gk20a_channel_sync *s)
{
	return -EINVAL;
}

static void gk20a_channel_semaphore_destroy(struct gk20a_channel_sync *s)
{
	struct gk20a_channel_semaphore *sema =
		container_of(s, struct gk20a_channel_semaphore, ops);
	if (sema->timeline)
		gk20a_sync_timeline_destroy(sema->timeline);

	/* The sema pool is cleaned up by the VM destroy. */
	sema->pool = NULL;

	nvgpu_kfree(sema->c->g, sema);
}

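/*
 * Set up semaphore-based sync for a channel: reuse the VM's semaphore pool,
 * create a sync timeline when CONFIG_SYNC is enabled, and populate the sync
 * ops table.
 */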
static struct gk20a_channel_sync *
gk20a_channel_semaphore_create(struct channel_gk20a *c)
{
	int asid = -1;
	struct gk20a_channel_semaphore *sema;
	char pool_name[20];

	if (WARN_ON(!c->vm))
		return NULL;

	sema = nvgpu_kzalloc(c->g, sizeof(*sema));
	if (!sema)
		return NULL;
	sema->c = c;

	if (c->vm->as_share)
		asid = c->vm->as_share->id;

	snprintf(pool_name, sizeof(pool_name),
		 "semaphore_pool-%d", c->hw_chid);
	sema->pool = c->vm->sema_pool;

#ifdef CONFIG_SYNC
	sema->timeline = gk20a_sync_timeline_create(
			"gk20a_ch%d_as%d", c->hw_chid, asid);
	if (!sema->timeline) {
		gk20a_channel_semaphore_destroy(&sema->ops);
		return NULL;
	}
#endif
	atomic_set(&sema->ops.refcount, 0);
	sema->ops.wait_syncpt	= gk20a_channel_semaphore_wait_syncpt;
	sema->ops.wait_fd	= gk20a_channel_semaphore_wait_fd;
	sema->ops.incr		= gk20a_channel_semaphore_incr;
	sema->ops.incr_wfi	= gk20a_channel_semaphore_incr_wfi;
	sema->ops.incr_user	= gk20a_channel_semaphore_incr_user;
	sema->ops.set_min_eq_max = gk20a_channel_semaphore_set_min_eq_max;
	sema->ops.signal_timeline = gk20a_channel_semaphore_signal_timeline;
	sema->ops.syncpt_id	= gk20a_channel_semaphore_syncpt_id;
	sema->ops.destroy	= gk20a_channel_semaphore_destroy;

	return &sema->ops;
}

void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync)
{
	sync->destroy(sync);
}

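/*
 * Pick the sync implementation for a channel: syncpoints when the platform
 * has them, semaphores otherwise.
 */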
struct gk20a_channel_sync *gk20a_channel_sync_create(struct channel_gk20a *c)
{
#ifdef CONFIG_TEGRA_GK20A_NVHOST
	if (gk20a_platform_has_syncpoints(c->g))
		return gk20a_channel_syncpt_create(c);
#endif
	return gk20a_channel_semaphore_create(c);
}

bool gk20a_channel_sync_needs_sync_framework(struct channel_gk20a *c)
{
	if (gk20a_platform_has_syncpoints(c->g))
		return false;
	return true;
}