v2: Introduce the appropriate pipe controls
    Properly deal with changes in metric sets (using execbuf parameter)
    Record marker at query end

v3: Fill out PerfCntr1&2

v4: Introduce vkUninitializePerformanceApiINTEL

v5: Use new execbuf extension mechanism

v6: Fix comments in genX_query.c (Rafael)
    Use PIPE_CONTROL workarounds (Rafael)
    Refactor on the last kernel series update (Lionel)

v7: Only I915_PERF_IOCTL_CONFIG when perf stream is already opened (Lionel)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Rafael Antognolli <rafael.antognolli@intel.com>

tags/19.3-branchpoint
@@ -305,6 +305,7 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ | |||
libmesa_compiler \ | |||
libmesa_intel_common \ | |||
libmesa_intel_dev \ | |||
libmesa_intel_perf \ | |||
libmesa_vulkan_common \ | |||
libmesa_vulkan_util \ | |||
libmesa_anv_gen7 \ |
@@ -259,6 +259,7 @@ VULKAN_FILES := \ | |||
vulkan/anv_nir_lower_push_constants.c \ | |||
vulkan/anv_nir_lower_ycbcr_textures.c \ | |||
vulkan/anv_pass.c \ | |||
vulkan/anv_perf.c \ | |||
vulkan/anv_pipeline.c \ | |||
vulkan/anv_pipeline_cache.c \ | |||
vulkan/anv_private.h \ |
@@ -604,6 +604,8 @@ anv_physical_device_init(struct anv_physical_device *device, | |||
goto fail; | |||
} | |||
device->perf = anv_get_perf(&device->info, fd); | |||
anv_physical_device_get_supported_extensions(device, | |||
&device->supported_extensions); | |||
@@ -625,6 +627,7 @@ anv_physical_device_finish(struct anv_physical_device *device) | |||
anv_finish_wsi(device); | |||
anv_physical_device_free_disk_cache(device); | |||
ralloc_free(device->compiler); | |||
ralloc_free(device->perf); | |||
close(device->local_fd); | |||
if (device->master_fd >= 0) | |||
close(device->master_fd); | |||
@@ -2657,6 +2660,8 @@ VkResult anv_CreateDevice( | |||
anv_device_init_border_colors(device); | |||
anv_device_perf_init(device); | |||
*pDevice = anv_device_to_handle(device); | |||
return VK_SUCCESS; |
@@ -165,6 +165,7 @@ EXTENSIONS = [ | |||
Extension('VK_ANDROID_native_buffer', 7, 'ANDROID'), | |||
Extension('VK_GOOGLE_decorate_string', 1, True), | |||
Extension('VK_GOOGLE_hlsl_functionality1', 1, True), | |||
Extension('VK_INTEL_performance_query', 1, 'device->perf'), | |||
Extension('VK_NV_compute_shader_derivatives', 1, True), | |||
] | |||
@@ -0,0 +1,224 @@ | |||
/* | |||
* Copyright © 2018 Intel Corporation | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining a | |||
* copy of this software and associated documentation files (the "Software"), | |||
* to deal in the Software without restriction, including without limitation | |||
* the rights to use, copy, modify, merge, publish, distribute, sublicense, | |||
* and/or sell copies of the Software, and to permit persons to whom the | |||
* Software is furnished to do so, subject to the following conditions: | |||
* | |||
* The above copyright notice and this permission notice (including the next | |||
* paragraph) shall be included in all copies or substantial portions of the | |||
* Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |||
* DEALINGS IN THE SOFTWARE. | |||
*/ | |||
#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#include "anv_private.h"

#include "perf/gen_perf.h"
#include "perf/gen_perf_mdapi.h"
struct gen_perf_config * | |||
anv_get_perf(const struct gen_device_info *devinfo, int fd) | |||
{ | |||
struct gen_perf_config *perf = gen_perf_new(NULL); | |||
gen_perf_init_metrics(perf, devinfo, fd); | |||
/* We need DRM_I915_PERF_PROP_HOLD_PREEMPTION support, only available in | |||
* perf revision 2. | |||
*/ | |||
if (anv_gem_get_param(fd, I915_PARAM_PERF_REVISION) < 3) | |||
goto err; | |||
return perf; | |||
err: | |||
ralloc_free(perf); | |||
return NULL; | |||
} | |||
void | |||
anv_device_perf_init(struct anv_device *device) | |||
{ | |||
device->perf_fd = -1; | |||
} | |||
/* Open an i915-perf stream on the device's DRM fd for the given metric set.
 *
 * Returns the stream file descriptor, or -1 on failure (errno set by the
 * ioctl).
 */
static int
anv_device_perf_open(struct anv_device *device, uint64_t metric_id)
{
   uint64_t properties[DRM_I915_PERF_PROP_MAX * 2];
   struct drm_i915_perf_open_param param;
   int p = 0, stream_fd;

   /* Properties are laid out as (key, value) pairs. */
   properties[p++] = DRM_I915_PERF_PROP_SAMPLE_OA;
   properties[p++] = true;

   properties[p++] = DRM_I915_PERF_PROP_OA_METRICS_SET;
   properties[p++] = metric_id;

   properties[p++] = DRM_I915_PERF_PROP_OA_FORMAT;
   properties[p++] = device->info.gen >= 8 ?
      I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
      I915_OA_FORMAT_A45_B8_C8;

   properties[p++] = DRM_I915_PERF_PROP_OA_EXPONENT;
   properties[p++] = 31; /* slowest sampling period */

   /* Filter reports to this device's hardware context. */
   properties[p++] = DRM_I915_PERF_PROP_CTX_HANDLE;
   properties[p++] = device->context_id;

   /* Requires i915-perf revision >= 3 (verified in anv_get_perf()). */
   properties[p++] = DRM_I915_PERF_PROP_HOLD_PREEMPTION;
   properties[p++] = true;

   memset(&param, 0, sizeof(param));
   param.flags = 0;
   param.flags |= I915_PERF_FLAG_FD_CLOEXEC | I915_PERF_FLAG_FD_NONBLOCK;
   param.properties_ptr = (uintptr_t)properties;
   param.num_properties = p / 2; /* number of (key, value) pairs */

   stream_fd = gen_ioctl(device->fd, DRM_IOCTL_I915_PERF_OPEN, &param);
   return stream_fd;
}
VkResult anv_InitializePerformanceApiINTEL( | |||
VkDevice _device, | |||
const VkInitializePerformanceApiInfoINTEL* pInitializeInfo) | |||
{ | |||
ANV_FROM_HANDLE(anv_device, device, _device); | |||
const struct anv_physical_device *pdevice = &device->instance->physicalDevice; | |||
if (!pdevice->perf) | |||
return VK_ERROR_EXTENSION_NOT_PRESENT; | |||
/* Not much to do here */ | |||
return VK_SUCCESS; | |||
} | |||
VkResult anv_GetPerformanceParameterINTEL( | |||
VkDevice _device, | |||
VkPerformanceParameterTypeINTEL parameter, | |||
VkPerformanceValueINTEL* pValue) | |||
{ | |||
ANV_FROM_HANDLE(anv_device, device, _device); | |||
const struct anv_physical_device *pdevice = &device->instance->physicalDevice; | |||
if (!pdevice->perf) | |||
return VK_ERROR_EXTENSION_NOT_PRESENT; | |||
VkResult result = VK_SUCCESS; | |||
switch (parameter) { | |||
case VK_PERFORMANCE_PARAMETER_TYPE_HW_COUNTERS_SUPPORTED_INTEL: | |||
pValue->type = VK_PERFORMANCE_VALUE_TYPE_BOOL_INTEL; | |||
pValue->data.valueBool = VK_TRUE; | |||
break; | |||
case VK_PERFORMANCE_PARAMETER_TYPE_STREAM_MARKER_VALID_BITS_INTEL: | |||
pValue->type = VK_PERFORMANCE_VALUE_TYPE_UINT32_INTEL; | |||
pValue->data.value32 = 25; | |||
break; | |||
default: | |||
result = VK_ERROR_FEATURE_NOT_PRESENT; | |||
break; | |||
} | |||
return result; | |||
} | |||
VkResult anv_CmdSetPerformanceMarkerINTEL( | |||
VkCommandBuffer commandBuffer, | |||
const VkPerformanceMarkerInfoINTEL* pMarkerInfo) | |||
{ | |||
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); | |||
cmd_buffer->intel_perf_marker = pMarkerInfo->marker; | |||
return VK_SUCCESS; | |||
} | |||
/* vkAcquirePerformanceConfigurationINTEL: load the MDAPI metric set
 * configuration and register it with i915.  The kernel's metric set id is
 * stored directly in the returned VkPerformanceConfigurationINTEL handle.
 */
VkResult anv_AcquirePerformanceConfigurationINTEL(
    VkDevice                                    _device,
    const VkPerformanceConfigurationAcquireInfoINTEL* pAcquireInfo,
    VkPerformanceConfigurationINTEL*            pConfiguration)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice =
      &device->instance->physicalDevice;

   struct gen_perf_registers *perf_config =
      gen_perf_load_configuration(pdevice->perf, device->fd,
                                  GEN_PERF_QUERY_GUID_MDAPI);
   if (!perf_config)
      return VK_INCOMPLETE;

   /* On success this returns the kernel metric set id (>= 0). */
   int ret = gen_perf_store_configuration(pdevice->perf, device->fd,
                                          perf_config, NULL /* guid */);
   if (ret < 0) {
      ralloc_free(perf_config);
      return VK_INCOMPLETE;
   }

   *pConfiguration = (VkPerformanceConfigurationINTEL) (uint64_t) ret;

   return VK_SUCCESS;
}
/* vkReleasePerformanceConfigurationINTEL: remove the previously registered
 * metric set configuration from i915.  Best effort: the ioctl result is
 * intentionally ignored.
 */
VkResult anv_ReleasePerformanceConfigurationINTEL(
    VkDevice                                    _device,
    VkPerformanceConfigurationINTEL             _configuration)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   uint64_t config = (uint64_t) _configuration;

   gen_ioctl(device->fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &config);

   return VK_SUCCESS;
}
VkResult anv_QueueSetPerformanceConfigurationINTEL( | |||
VkQueue _queue, | |||
VkPerformanceConfigurationINTEL _configuration) | |||
{ | |||
ANV_FROM_HANDLE(anv_queue, queue, _queue); | |||
struct anv_device *device = queue->device; | |||
uint64_t configuration = (uint64_t) _configuration; | |||
if (device->perf_fd < 0) { | |||
device->perf_fd = anv_device_perf_open(device, configuration); | |||
if (device->perf_fd < 0) | |||
return VK_ERROR_INITIALIZATION_FAILED; | |||
} else { | |||
int ret = gen_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG, | |||
(void *)(uintptr_t) _configuration); | |||
if (ret < 0) { | |||
return anv_device_set_lost(device, | |||
"i915-perf config failed: %s", | |||
strerror(ret)); | |||
} | |||
} | |||
return VK_SUCCESS; | |||
} | |||
void anv_UninitializePerformanceApiINTEL( | |||
VkDevice _device) | |||
{ | |||
ANV_FROM_HANDLE(anv_device, device, _device); | |||
if (device->perf_fd >= 0) { | |||
close(device->perf_fd); | |||
device->perf_fd = -1; | |||
} | |||
} |
@@ -74,6 +74,7 @@ struct anv_image_view; | |||
struct anv_instance; | |||
struct gen_l3_config; | |||
struct gen_perf_config; | |||
#include <vulkan/vulkan.h> | |||
#include <vulkan/vulkan_intel.h> | |||
@@ -948,6 +949,7 @@ struct anv_physical_device { | |||
bool supports_48bit_addresses; | |||
struct brw_compiler * compiler; | |||
struct isl_device isl_dev; | |||
struct gen_perf_config * perf; | |||
int cmd_parser_version; | |||
bool has_exec_async; | |||
bool has_exec_capture; | |||
@@ -1169,6 +1171,9 @@ struct anv_device { | |||
* the cmd_buffer's list. | |||
*/ | |||
struct anv_cmd_buffer *cmd_buffer_being_decoded; | |||
int perf_fd; /* -1 if no stream opened */ | |||
uint64_t perf_metric; /* 0 if unset */ | |||
}; | |||
static inline struct anv_state_pool * | |||
@@ -2530,6 +2535,9 @@ struct anv_cmd_buffer { | |||
VkCommandBufferLevel level; | |||
struct anv_cmd_state state; | |||
/* Set by SetPerformanceMarkerINTEL, written into queries by CmdEndQuery */ | |||
uint64_t intel_perf_marker; | |||
}; | |||
VkResult anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer); | |||
@@ -3750,6 +3758,9 @@ anv_get_subpass_id(const struct anv_cmd_state * const cmd_state) | |||
return subpass_id; | |||
} | |||
struct gen_perf_config *anv_get_perf(const struct gen_device_info *devinfo, int fd); | |||
void anv_device_perf_init(struct anv_device *device); | |||
#define ANV_DEFINE_HANDLE_CASTS(__anv_type, __VkType) \ | |||
\ | |||
static inline struct __anv_type * \ |
@@ -5091,3 +5091,57 @@ void genX(CmdWaitEvents)( | |||
bufferMemoryBarrierCount, pBufferMemoryBarriers, | |||
imageMemoryBarrierCount, pImageMemoryBarriers); | |||
} | |||
/* vkCmdSetPerformanceOverrideINTEL: apply an MDAPI-requested override.
 *
 * NULL_HARDWARE toggles 3D/media instruction execution through
 * CS_DEBUG_MODE2 (gen9+) or INSTPM (older gens); FLUSH_GPU_CACHES emits a
 * full flush + invalidate of the pipeline caches.
 */
VkResult genX(CmdSetPerformanceOverrideINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceOverrideInfoINTEL*       pOverrideInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   switch (pOverrideInfo->type) {
   case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
      uint32_t dw;

#if GEN_GEN >= 9
      /* The *Mask bits must be set for the corresponding disable bits to
       * take effect in this register.
       */
      anv_pack_struct(&dw, GENX(CS_DEBUG_MODE2),
                      ._3DRenderingInstructionDisable = pOverrideInfo->enable,
                      .MediaInstructionDisable = pOverrideInfo->enable,
                      ._3DRenderingInstructionDisableMask = true,
                      .MediaInstructionDisableMask = true);
      emit_lri(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2_num), dw);
#else
      anv_pack_struct(&dw, GENX(INSTPM),
                      ._3DRenderingInstructionDisable = pOverrideInfo->enable,
                      .MediaInstructionDisable = pOverrideInfo->enable,
                      ._3DRenderingInstructionDisableMask = true,
                      .MediaInstructionDisableMask = true);
      emit_lri(&cmd_buffer->batch, GENX(INSTPM_num), dw);
#endif
      break;
   }

   case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
      if (pOverrideInfo->enable) {
         /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
         cmd_buffer->state.pending_pipe_bits |=
            ANV_PIPE_FLUSH_BITS |
            ANV_PIPE_INVALIDATE_BITS;
         genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
      }
      break;

   default:
      unreachable("Invalid override");
   }

   return VK_SUCCESS;
}
/* vkCmdSetPerformanceStreamMarkerINTEL: intended to write a marker into the
 * i915-perf stream.  Currently a no-op.
 *
 * TODO: Waiting on the register to write, might depend on generation.
 */
VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
{
   return VK_SUCCESS;
}
@@ -37,6 +37,10 @@ | |||
#define __gen_get_batch_dwords anv_batch_emit_dwords | |||
#define __gen_address_offset anv_address_add | |||
#include "common/gen_mi_builder.h" | |||
#include "perf/gen_perf.h" | |||
#include "perf/gen_perf_mdapi.h" | |||
#define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t)) | |||
VkResult genX(CreateQueryPool)( | |||
VkDevice _device, | |||
@@ -52,9 +56,14 @@ VkResult genX(CreateQueryPool)( | |||
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); | |||
/* Query pool slots are made up of some number of 64-bit values packed | |||
* tightly together. The first 64-bit value is always the "available" bit | |||
* which is 0 when the query is unavailable and 1 when it is available. | |||
* The 64-bit values that follow are determined by the type of query. | |||
* tightly together. For most query types, the first 64-bit value is | |||
* the "available" bit which is 0 when the query is unavailable and 1 when | |||
* it is available. The 64-bit values that follow are determined by the | |||
* type of query. | |||
* | |||
* For performance queries, we have a requirement to align OA reports at | |||
* 64bytes so we put those first and have the "available" bit behind | |||
* together with some other counters. | |||
*/ | |||
uint32_t uint64s_per_slot = 1; | |||
@@ -84,6 +93,15 @@ VkResult genX(CreateQueryPool)( | |||
*/ | |||
uint64s_per_slot += 4; | |||
break; | |||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { | |||
uint64s_per_slot = 2 * OA_REPORT_N_UINT64; /* begin & end OA reports */ | |||
uint64s_per_slot += 4; /* PerfCounter 1 & 2 */ | |||
uint64s_per_slot++; /* 2 * 32bit RPSTAT register */ | |||
uint64s_per_slot++; /* 64bit marker */ | |||
uint64s_per_slot++; /* availability */ | |||
uint64s_per_slot = align_u32(uint64s_per_slot, 8); /* OA reports must be aligned to 64 bytes */ | |||
break; | |||
} | |||
default: | |||
assert(!"Invalid query type"); | |||
} | |||
@@ -160,6 +178,57 @@ anv_query_address(struct anv_query_pool *pool, uint32_t query) | |||
}; | |||
} | |||
/** | |||
* VK_INTEL_performance_query layout: | |||
* | |||
* ------------------------------ | |||
* | end MI_RPC (256b) | | |||
* |----------------------------| | |||
* | begin MI_RPC (256b) | | |||
* |----------------------------| | |||
* | begin perfcntr 1 & 2 (16b) | | |||
* |----------------------------| | |||
* | end perfcntr 1 & 2 (16b) | | |||
* |----------------------------| | |||
* | begin RPSTAT register (4b) | | |||
* |----------------------------| | |||
* | end RPSTAT register (4b) | | |||
* |----------------------------| | |||
* | marker (8b) | | |||
* |----------------------------| | |||
* | availability (8b) | | |||
* ------------------------------ | |||
*/ | |||
/* Byte offset of a MI_REPORT_PERF_COUNT report within a query slot.  The
 * end report is placed first (offset 0), the begin report right after it
 * (offset 256).
 */
static uint32_t
intel_perf_mi_rpc_offset(bool end)
{
   if (end)
      return 0;
   return 256;
}
/* Byte offset of the begin/end PerfCntr1&2 snapshots, located right after
 * the two 256-byte MI_RPC reports.  Each snapshot is two 64-bit values.
 */
static uint32_t
intel_perf_counter(bool end)
{
   const uint32_t base = 2 * 256; /* two OA reports */
   return base + (end ? 2 * sizeof(uint64_t) : 0);
}
/* Byte offset of the begin/end RPSTAT register snapshot (32 bits each),
 * located after the four 64-bit perf-counter values.
 */
static uint32_t
intel_perf_rpstart_offset(bool end)
{
   const uint32_t base =
      intel_perf_counter(false) + 4 * sizeof(uint64_t);
   if (end)
      return base + sizeof(uint32_t);
   return base;
}
/* Byte offset of the 64-bit marker value recorded by
 * vkCmdSetPerformanceMarkerINTEL, located after the two 32-bit RPSTAT
 * snapshots.
 */
static uint32_t
intel_perf_marker_offset(void)
{
   return intel_perf_rpstart_offset(false) + sizeof(uint64_t);
}
static void | |||
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags, | |||
uint32_t value_index, uint64_t result) | |||
@@ -173,18 +242,28 @@ cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags, | |||
} | |||
} | |||
static void * | |||
query_slot(struct anv_query_pool *pool, uint32_t query) | |||
{ | |||
return pool->bo.map + query * pool->stride; | |||
} | |||
static bool | |||
query_is_available(uint64_t *slot) | |||
query_is_available(struct anv_query_pool *pool, uint32_t query) | |||
{ | |||
return *(volatile uint64_t *)slot; | |||
if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) { | |||
return *(volatile uint64_t *)((uint8_t *)query_slot(pool, query) + | |||
pool->stride - 8); | |||
} else | |||
return *(volatile uint64_t *)query_slot(pool, query); | |||
} | |||
static VkResult | |||
wait_for_available(struct anv_device *device, | |||
struct anv_query_pool *pool, uint64_t *slot) | |||
struct anv_query_pool *pool, uint32_t query) | |||
{ | |||
while (true) { | |||
if (query_is_available(slot)) | |||
if (query_is_available(pool, query)) | |||
return VK_SUCCESS; | |||
int ret = anv_gem_busy(device, pool->bo.gem_handle); | |||
@@ -197,7 +276,7 @@ wait_for_available(struct anv_device *device, | |||
} else { | |||
assert(ret == 0); | |||
/* The BO is no longer busy. */ | |||
if (query_is_available(slot)) { | |||
if (query_is_available(pool, query)) { | |||
return VK_SUCCESS; | |||
} else { | |||
VkResult status = anv_device_query_status(device); | |||
@@ -233,7 +312,8 @@ VkResult genX(GetQueryPoolResults)( | |||
assert(pool->type == VK_QUERY_TYPE_OCCLUSION || | |||
pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || | |||
pool->type == VK_QUERY_TYPE_TIMESTAMP || | |||
pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT); | |||
pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT || | |||
pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL); | |||
if (anv_device_is_lost(device)) | |||
return VK_ERROR_DEVICE_LOST; | |||
@@ -245,13 +325,10 @@ VkResult genX(GetQueryPoolResults)( | |||
VkResult status = VK_SUCCESS; | |||
for (uint32_t i = 0; i < queryCount; i++) { | |||
uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride; | |||
/* Availability is always at the start of the slot */ | |||
bool available = slot[0]; | |||
bool available = query_is_available(pool, firstQuery + i); | |||
if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) { | |||
status = wait_for_available(device, pool, slot); | |||
status = wait_for_available(device, pool, firstQuery + i); | |||
if (status != VK_SUCCESS) | |||
return status; | |||
@@ -271,13 +348,16 @@ VkResult genX(GetQueryPoolResults)( | |||
uint32_t idx = 0; | |||
switch (pool->type) { | |||
case VK_QUERY_TYPE_OCCLUSION: | |||
case VK_QUERY_TYPE_OCCLUSION: { | |||
uint64_t *slot = query_slot(pool, firstQuery + i); | |||
if (write_results) | |||
cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]); | |||
idx++; | |||
break; | |||
} | |||
case VK_QUERY_TYPE_PIPELINE_STATISTICS: { | |||
uint64_t *slot = query_slot(pool, firstQuery + i); | |||
uint32_t statistics = pool->pipeline_statistics; | |||
while (statistics) { | |||
uint32_t stat = u_bit_scan(&statistics); | |||
@@ -297,7 +377,8 @@ VkResult genX(GetQueryPoolResults)( | |||
break; | |||
} | |||
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: | |||
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { | |||
uint64_t *slot = query_slot(pool, firstQuery + i); | |||
if (write_results) | |||
cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]); | |||
idx++; | |||
@@ -305,12 +386,54 @@ VkResult genX(GetQueryPoolResults)( | |||
cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]); | |||
idx++; | |||
break; | |||
} | |||
case VK_QUERY_TYPE_TIMESTAMP: | |||
case VK_QUERY_TYPE_TIMESTAMP: { | |||
uint64_t *slot = query_slot(pool, firstQuery + i); | |||
if (write_results) | |||
cpu_write_query_result(pData, flags, idx, slot[1]); | |||
idx++; | |||
break; | |||
} | |||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { | |||
if (!write_results) | |||
break; | |||
const void *query_data = query_slot(pool, firstQuery + i); | |||
const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false); | |||
const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true); | |||
const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false); | |||
const uint32_t *rpstat_end = query_data + intel_perf_mi_rpc_offset(true); | |||
struct gen_perf_query_result result; | |||
struct gen_perf_query_info metric = { | |||
.oa_format = (GEN_GEN >= 8 ? | |||
I915_OA_FORMAT_A32u40_A4u32_B8_C8 : | |||
I915_OA_FORMAT_A45_B8_C8), | |||
}; | |||
uint32_t core_freq[2]; | |||
#if GEN_GEN < 9 | |||
core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL; | |||
core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL; | |||
#else | |||
core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL; | |||
core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL; | |||
#endif | |||
gen_perf_query_result_clear(&result); | |||
gen_perf_query_result_accumulate(&result, &metric, | |||
oa_begin, oa_end); | |||
gen_perf_query_result_read_frequencies(&result, &device->info, | |||
oa_begin, oa_end); | |||
gen_perf_query_result_write_mdapi(pData, stride, | |||
&device->info, | |||
&result, | |||
core_freq[0], core_freq[1]); | |||
gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info, | |||
query_data + intel_perf_counter(false), | |||
query_data + intel_perf_counter(true)); | |||
const uint64_t *marker = query_data + intel_perf_marker_offset(); | |||
gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker); | |||
break; | |||
} | |||
default: | |||
unreachable("invalid pool type"); | |||
@@ -406,6 +529,16 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer, | |||
} | |||
break; | |||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: | |||
for (uint32_t i = 0; i < num_queries; i++) { | |||
struct anv_address slot_addr = | |||
anv_query_address(pool, first_index + i); | |||
gen_mi_memset(b, slot_addr, 0, pool->stride - 8); | |||
emit_query_mi_availability(b, anv_address_add(slot_addr, | |||
pool->stride - 8), true); | |||
} | |||
break; | |||
default: | |||
unreachable("Unsupported query type"); | |||
} | |||
@@ -440,6 +573,21 @@ void genX(CmdResetQueryPool)( | |||
break; | |||
} | |||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { | |||
struct gen_mi_builder b; | |||
gen_mi_builder_init(&b, &cmd_buffer->batch); | |||
for (uint32_t i = 0; i < queryCount; i++) { | |||
emit_query_mi_availability( | |||
&b, | |||
anv_address_add( | |||
anv_query_address(pool, firstQuery + i), | |||
pool->stride - 8), | |||
false); | |||
} | |||
break; | |||
} | |||
default: | |||
unreachable("Unsupported query type"); | |||
} | |||
@@ -550,6 +698,37 @@ void genX(CmdBeginQueryIndexedEXT)( | |||
emit_xfb_query(&b, index, anv_address_add(query_addr, 8)); | |||
break; | |||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { | |||
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { | |||
pc.CommandStreamerStallEnable = true; | |||
pc.StallAtPixelScoreboard = true; | |||
} | |||
anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) { | |||
rpc.MemoryAddress = | |||
anv_address_add(query_addr, intel_perf_mi_rpc_offset(false)); | |||
} | |||
#if GEN_GEN < 9 | |||
gen_mi_store(&b, | |||
gen_mi_mem32(anv_address_add(query_addr, | |||
intel_perf_rpstart_offset(false))), | |||
gen_mi_reg32(GENX(RPSTAT1_num))); | |||
#else | |||
gen_mi_store(&b, | |||
gen_mi_mem32(anv_address_add(query_addr, | |||
intel_perf_rpstart_offset(false))), | |||
gen_mi_reg32(GENX(RPSTAT0_num))); | |||
#endif | |||
#if GEN_GEN >= 8 && GEN_GEN <= 11 | |||
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, | |||
intel_perf_counter(false))), | |||
gen_mi_reg64(GENX(PERFCNT1_num))); | |||
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, | |||
intel_perf_counter(false) + 8)), | |||
gen_mi_reg64(GENX(PERFCNT2_num))); | |||
#endif | |||
break; | |||
} | |||
default: | |||
unreachable(""); | |||
} | |||
@@ -611,6 +790,45 @@ void genX(CmdEndQueryIndexedEXT)( | |||
emit_query_mi_availability(&b, query_addr, true); | |||
break; | |||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { | |||
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { | |||
pc.CommandStreamerStallEnable = true; | |||
pc.StallAtPixelScoreboard = true; | |||
} | |||
uint32_t marker_offset = intel_perf_marker_offset(); | |||
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)), | |||
gen_mi_imm(cmd_buffer->intel_perf_marker)); | |||
#if GEN_GEN >= 8 && GEN_GEN <= 11 | |||
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))), | |||
gen_mi_reg64(GENX(PERFCNT1_num))); | |||
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)), | |||
gen_mi_reg64(GENX(PERFCNT2_num))); | |||
#endif | |||
#if GEN_GEN < 9 | |||
gen_mi_store(&b, | |||
gen_mi_mem32(anv_address_add(query_addr, | |||
intel_perf_rpstart_offset(true))), | |||
gen_mi_reg32(GENX(RPSTAT1_num))); | |||
#else | |||
gen_mi_store(&b, | |||
gen_mi_mem32(anv_address_add(query_addr, | |||
intel_perf_rpstart_offset(true))), | |||
gen_mi_reg32(GENX(RPSTAT0_num))); | |||
#endif | |||
/* Position the last OA snapshot at the beginning of the query so that | |||
* we can tell whether it's ready. | |||
*/ | |||
anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) { | |||
rpc.MemoryAddress = anv_address_add(query_addr, | |||
intel_perf_mi_rpc_offset(true)); | |||
rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */ | |||
} | |||
emit_query_mi_availability(&b, | |||
anv_address_add(query_addr, pool->stride - 8), | |||
true); | |||
break; | |||
} | |||
default: | |||
unreachable(""); | |||
} |
@@ -118,6 +118,7 @@ libanv_files = files( | |||
'anv_nir_lower_push_constants.c', | |||
'anv_nir_lower_ycbcr_textures.c', | |||
'anv_pass.c', | |||
'anv_perf.c', | |||
'anv_pipeline.c', | |||
'anv_pipeline_cache.c', | |||
'anv_private.h', | |||
@@ -194,6 +195,7 @@ libvulkan_intel = shared_library( | |||
link_whole : [libanv_common, libanv_gen_libs], | |||
link_with : [ | |||
libintel_compiler, libintel_dev, libisl, libblorp, libvulkan_wsi, | |||
libintel_perf, | |||
], | |||
dependencies : [ | |||
dep_thread, dep_dl, dep_m, anv_deps, idep_libintel_common, | |||
@@ -227,7 +229,7 @@ if with_tests | |||
link_whole : libanv_common, | |||
link_with : [ | |||
libanv_gen_libs, libintel_compiler, libintel_common, libintel_dev, | |||
libisl, libblorp, libvulkan_wsi, | |||
libisl, libblorp, libvulkan_wsi, libintel_perf, | |||
], | |||
dependencies : [ | |||
dep_thread, dep_dl, dep_m, anv_deps, |