#if 0 /* done in Makefile */
#define _GNU_SOURCE /* or _BSD_SOURCE or _SVID_SOURCE */
#endif

#include "asm/unistd.h" /* from kernel source tree */
#include <unistd.h> /* for syscall */
#include <stdio.h>
#include <stdlib.h>
#include <sched.h>
#include <sys/ioctl.h>
#include <stdint.h> /* rt_param needs uint32 */
#include "../../litmus-rt/include/linux/perf_event.h"
#include <sys/user.h> /* page size macro */

/* from kernel tools/perf/perf.h */
static inline int sys_perf_event_open(struct perf_event_attr *attr,
				      pid_t pid, int cpu, int group_fd,
				      unsigned long flags)
{
	attr->size = sizeof(*attr);
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

#define C(x) (PERF_COUNT_HW_CACHE_##x)

/*
 * Encode a generic cache event: cache id in bits 0-7, operation in
 * bits 8-15, result in bits 16-23.
 */
#define ATTR_CONFIG_CACHE(cache, op, result) \
	(((C(cache)  & 0xffULL) <<  0) | \
	 ((C(op)     & 0xffULL) <<  8) | \
	 ((C(result) & 0xffULL) << 16))

/* Encode a raw event: event select in bits 0-7, unit mask in bits 8-15. */
#define ATTR_CONFIG(event, umask) \
	((((event) & 0xffULL) << 0) | \
	 (((umask) & 0xffULL) << 8))

struct perf_event_attr perf_event_attr = {
	.type = 0,		/* set per initialized event */
	.size = 0,		/* set later */
	.config = 0,		/* set per initialized event */
	{ .sample_period = 0, },	/* is a counter, so no period */
	.disabled = 0,		/* event is enabled */
	.inherit = 0,		/* children don't inherit */
	.pinned = 0,		/* set per initialized event */
	.exclusive = 0,		/* set per initialized event */
	.exclude_user = 0,	/* count user events */
	.exclude_kernel = 0,	/* count kernel events */
	.exclude_hv = 0,	/* count hypervisor events */
	.exclude_idle = 0,	/* count when idle */
	.mmap = 0,		/* don't include mmap data */
	.comm = 0,		/* don't include comm data */
};

/* Pound */
#define NR_CPUS 4
#define CACHE_SIZE_MB 8
#define ASSOC 16
#define LINE_SIZE 64

#define CACHE_SIZE (CACHE_SIZE_MB * 1024 * 1024)

/* arena size in bytes */
//#define ARENA_SIZE (CACHE_SIZE * 14 / 16)
#define ARENA_SIZE (CACHE_SIZE * 1)

/* number of pages in arena */
#define ARENA_PAGES (ARENA_SIZE / PAGE_SIZE)
/* number of cache lines per page */
#define PAGE_LINES (PAGE_SIZE / LINE_SIZE)
/* number of cache lines in arena */
#define ARENA_LINES (ARENA_SIZE / LINE_SIZE)
/* number of integers in arena */
#define ARENA_INTS (ARENA_SIZE / sizeof(int))
/* number of integers in a page */
#define PAGE_INTS (PAGE_SIZE / sizeof(int))
/* number of integers in a cache line */
#define LINE_INTS (LINE_SIZE / sizeof(int))

/* convert page number and cache line number to an integer index */
#define PAGE_AND_LINE_TO_IDX(page, line) \
	(((page) * PAGE_INTS) + ((line) * LINE_INTS))

/* not really a good way to do this */
static inline int randrange(const int max)
{
	return (rand() / (RAND_MAX / max + 1));
}

void sequential(int *items, const int len)
{
	int i;
	for (i = 0; i < len; i++)
		items[i] = (i + 1) % len;
}

/* Sattolo's algorithm makes a single cycle. */
void sattolo(int *items, const int len)
{
	int i;
	for (i = 0; i < len; i++)
		items[i] = i;
	while (1 < i--) {
		/* 0 <= j < i */
		int t, j = randrange(i);
		t = items[i];
		items[i] = items[j];
		items[j] = t;
	}
}
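/*
 * Illustrative sketch, not part of the original tool: because Sattolo's
 * algorithm yields a single cycle, walking the permutation from index 0
 * must visit every element exactly once before returning to 0. A check
 * like the following (hypothetical helper name) would verify that:
 */
#if 0
static int is_single_cycle(const int *items, const int len)
{
	int i = items[0], steps = 1;
	while (i != 0) {
		i = items[i];	/* follow the permutation */
		steps++;
	}
	return steps == len;	/* true iff the cycle covers all elements */
}
#endif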
/*
 * Write the order to read the arena into the arena itself. Every line of
 * every page is read; pages are visited in order, but the lines within
 * each page are visited in a random order to keep the prefetcher from
 * working.
 */
static void init_arena_page_line_order(int *arena, int *page_line_order)
{
	int cur_page;
	for (cur_page = 0; cur_page < ARENA_PAGES; cur_page++) {
		/* for each page in the arena */
		int cur_line;
		for (cur_line = 0; cur_line < PAGE_LINES; cur_line++) {
			/* for each line in the page */
			const int idx = PAGE_AND_LINE_TO_IDX(cur_page,
					cur_line);
			const int next_line = page_line_order[cur_line];
			int next_idx = PAGE_AND_LINE_TO_IDX(cur_page,
					next_line);

			if (!next_line) {
				/* special case: last line in the page */
				if (cur_page < ARENA_PAGES - 1) {
					/* arena has more pages: go to next */
					next_idx = PAGE_AND_LINE_TO_IDX(
							(cur_page + 1), 0);
				} else {
					/* the very last element */
					next_idx = 0;
				}
			}
			arena[idx] = next_idx;
		}
	}
}

static int loop_once(const int perf_fd, int *arena)
{
	int i = 0, j;
	do {
		/* pointer-chase through the arena */
		i = arena[i];
		j = i;
	} while (i);
	return j; /* return a value so the chase is not optimized away */
}

static int set_affinity(int cpu)
{
	cpu_set_t cpu_set;
	CPU_ZERO(&cpu_set);
	CPU_SET(cpu, &cpu_set);
	return sched_setaffinity(0, sizeof(cpu_set_t), &cpu_set);
}

struct perf_fd {
	int fd;
	char *name;
	enum perf_type_id type;
	__u64 config;
	__u64 exclusive	: 1,
	      pinned	: 1,
	      __reserved_1 : 62;
};

#define PERF_FD_EMPTY(p) \
	((p)->fd == 0 && (p)->name == NULL && \
	 (p)->type == 0 && (p)->config == 0)
#define PERF_FD_NON_EMPTY(p) (!PERF_FD_EMPTY(p))

#if 0 /* these events are always zero */
static struct perf_fd perf_fds[] = {
	{
		.fd = -1,
		.name = "MEM_UNCORE_RETIRED.REMOTE_CACHE_LOCAL_HOME_HIT",
		.type = PERF_TYPE_RAW,
		.config = ATTR_CONFIG(0x0f, 0x08),
		.exclusive = 0,
		.pinned = 0,
	},
	{
		.fd = -1,
		.name = "MEM_UNCORE_RETIRED.REMOTE_DRAM",
		.type = PERF_TYPE_RAW,
		.config = ATTR_CONFIG(0x0f, 0x10),
		.exclusive = 0, /* child events cannot be exclusive */
		.pinned = 0,    /* child events cannot be pinned */
	},
	{ },
};
#endif

static struct perf_fd perf_fds[] = {
	/* first element is assumed to be the group leader */
#if 0
	{
		.fd = -1,
		.name = "MEM_UNCORE_RETIRED.LOCAL_DRAM",
		.type = PERF_TYPE_RAW,
		.config = ATTR_CONFIG(0x0f, 0x20),
		.exclusive = 1, /* group leader is scheduled exclusively */
		.pinned = 1,    /* group leader is pinned to CPU (always on) */
	},
#endif
	{
		.fd = -1,
		.name = "L2_RQSTS.PREFETCH_HIT",
		.type = PERF_TYPE_RAW,
		.config = ATTR_CONFIG(0x24, 0x40),
		.exclusive = 1, /* group leader is scheduled exclusively */
		.pinned = 1,    /* group leader is pinned to CPU (always on) */
	},
	{
		.fd = -1,
		.name = "L2_RQSTS.PREFETCH_MISS",
		.type = PERF_TYPE_RAW,
		.config = ATTR_CONFIG(0x24, 0x80),
		.exclusive = 0,
		.pinned = 0,
	},
	{
		.fd = -1,
		.name = "MEM_LOAD_RETIRED.L3_MISS",
		.type = PERF_TYPE_RAW,
		.config = ATTR_CONFIG(0xcb, 0x10),
		.exclusive = 0,
		.pinned = 0,
	},
	{
		.fd = -1,
		.name = "Off Core Response Counter",
		.type = PERF_TYPE_HW_CACHE,
		.config = ATTR_CONFIG_CACHE(LL, OP_READ, RESULT_MISS),
#if 0
		/* read misses */
		.config = ATTR_CONFIG_CACHE(LL, OP_READ, RESULT_MISS),
		/* write misses */
		.config = ATTR_CONFIG_CACHE(LL, OP_WRITE, RESULT_MISS),
		/* prefetch misses */
		.config = ATTR_CONFIG_CACHE(LL, OP_PREFETCH, RESULT_MISS),
#endif
		.exclusive = 0,
		.pinned = 0,
	},
	{ },
};

static inline void events_ioctl(const int request)
{
	/* the ioctl on the group leader applies to the whole group */
	ioctl(perf_fds[0].fd, request);
}

static void do_read(double divide)
{
	struct perf_fd *perf_fd;
	for (perf_fd = perf_fds; PERF_FD_NON_EMPTY(perf_fd); perf_fd++) {
		__u64 perf_val;
		ssize_t ret;

		ret = read(perf_fd->fd, &perf_val, sizeof(perf_val));
		if (0 >= ret)
			printf("%50s: ERROR\n", perf_fd->name);
		else
			printf("%50s: %10.3f\n", perf_fd->name,
			       (perf_val / divide));
		ioctl(perf_fd->fd, PERF_EVENT_IOC_RESET);
	}
}
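/*
 * Alternative sketch, not used above: had the group leader been opened
 * with attr.read_format = PERF_FORMAT_GROUP, all counters in the group
 * could be read atomically with a single read() on the leader's fd. The
 * buffer layout follows perf_event_open(2): a __u64 count followed by
 * one __u64 value per counter. The function name is illustrative.
 */
#if 0
static void do_group_read(void)
{
	/* struct read_format { __u64 nr; __u64 values[nr]; }; */
	__u64 buf[1 + (sizeof(perf_fds) / sizeof(perf_fds[0]))];
	ssize_t ret = read(perf_fds[0].fd, buf, sizeof(buf));
	if (ret > 0) {
		__u64 i;
		for (i = 0; i < buf[0]; i++)
			printf("counter %llu: %llu\n", i, buf[1 + i]);
	}
}
#endif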
static void write_global_perf_attr(struct perf_fd *perf_fd)
{
	perf_event_attr.type = perf_fd->type;
	perf_event_attr.config = perf_fd->config;
	perf_event_attr.exclusive = perf_fd->exclusive;
	perf_event_attr.pinned = perf_fd->pinned;
}

#define CPU 0
static int setup_perf(void)
{
	/* cannot have pid == -1 and cpu == -1 */
	const int perf_pid = -1;  /* -1: all tasks, 0: this task */
	const int perf_cpu = CPU; /* -1: all CPUs (follow task) */
	struct perf_fd *perf_fd;
	int err = 0;

	for (perf_fd = perf_fds; PERF_FD_NON_EMPTY(perf_fd); perf_fd++) {
		/*
		 * Make a group whose leader is the zeroth element. When
		 * opening the leader itself, perf_fds[0].fd is still -1,
		 * which tells the kernel to create a new group.
		 */
		const int perf_group = perf_fds[0].fd;

		/* set up the attributes to pass in */
		write_global_perf_attr(perf_fd);

		perf_fd->fd = sys_perf_event_open(&perf_event_attr,
				perf_pid, perf_cpu, perf_group, 0);
		if (0 > perf_fd->fd) {
			fprintf(stderr, "could not setup %s\n", perf_fd->name);
			err = -1;
			goto out;
		}
	}
out:
	return err;
}

int main(int argc, char **argv)
{
	const int task_cpu = CPU;
	int ret = 0, i;
	int *arena, *page_line_order;

	if (set_affinity(task_cpu)) {
		fprintf(stderr, "could not set affinity\n");
		ret = -1;
		goto out;
	}

	arena = malloc(ARENA_SIZE);
	if (!arena) {
		fprintf(stderr, "could not allocate memory\n");
		ret = -1;
		goto out;
	}

	page_line_order = malloc(PAGE_LINES * sizeof(*page_line_order));
	if (!page_line_order) {
		fprintf(stderr, "could not allocate memory\n");
		ret = -1;
		goto out;
	}

	sattolo(page_line_order, PAGE_LINES);
	//sequential(page_line_order, PAGE_LINES);
	init_arena_page_line_order(arena, page_line_order);

	if (setup_perf()) {
		ret = -1;
		goto out;
	}

	printf("arena_size: %d\n", ARENA_SIZE);
	printf("arena_lines: %d\n", ARENA_LINES);

	printf("initially\n");
	do_read(1.0);

	events_ioctl(PERF_EVENT_IOC_ENABLE);
	loop_once(perf_fds[0].fd, arena);
	events_ioctl(PERF_EVENT_IOC_DISABLE);
	printf("after a loop\n");
	do_read(1.0);

	events_ioctl(PERF_EVENT_IOC_ENABLE);
	loop_once(perf_fds[0].fd, arena);
	events_ioctl(PERF_EVENT_IOC_DISABLE);
	printf("after another loop\n");
	do_read(1.0);

	events_ioctl(PERF_EVENT_IOC_ENABLE);
	loop_once(perf_fds[0].fd, arena);
	events_ioctl(PERF_EVENT_IOC_DISABLE);
	printf("after another loop\n");
	do_read(1.0);

	events_ioctl(PERF_EVENT_IOC_ENABLE);
	loop_once(perf_fds[0].fd, arena);
	events_ioctl(PERF_EVENT_IOC_DISABLE);
	printf("after another loop\n");
	do_read(1.0);

	events_ioctl(PERF_EVENT_IOC_ENABLE);
	for (i = 0; i < 100; i++)
		loop_once(perf_fds[0].fd, arena);
	events_ioctl(PERF_EVENT_IOC_DISABLE);
	printf("after 100 loops\n");
	do_read(100.0); /* divide by 100 to report per-loop averages */
out:
	return ret;
}
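/*
 * Build/run notes (assumptions, not from the original source): with
 * _GNU_SOURCE supplied by the Makefile, something like
 *
 *	gcc -O2 -Wall -o cache_test cache_test.c
 *
 * should suffice (the file name is illustrative). The raw event codes
 * above (0x24, 0xcb, 0x0f) appear to target an Intel Nehalem-class CPU,
 * and opening system-wide, per-CPU counters typically requires root or
 * a permissive /proc/sys/kernel/perf_event_paranoid setting.
 */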