/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/mman.h>

#include <drm.h>
#include <i915_drm.h>

#include "private.h"
#include "aub.h"

/* State for the AUB trace file that command buffer dumps are written to. */
struct anv_aub_writer {
   /* Output stream that receives every AUB packet. */
   FILE *file;
   /* GTT offset at which dumped BOs start being placed; initialised to the
    * size of the GTT so objects land just after it. */
   uint32_t offset;
   /* Copy of device->info.gen; gen >= 8 selects the longer block header. */
   int gen;
};

/* Append one 32-bit value to the trace file exactly as it sits in memory. */
static void
aub_out(struct anv_aub_writer *writer, uint32_t data)
{
   const uint32_t dword = data;

   fwrite(&dword, sizeof(dword), 1, writer->file);
}

/* Append `size` raw bytes starting at `data` to the trace file. */
static void
aub_out_data(struct anv_aub_writer *writer, const void *data, size_t size)
{
   FILE *out = writer->file;

   fwrite(data, sizeof(char), size, out);
}

/*
 * Return the device's AUB writer, creating it on first use.
 *
 * Creation opens "intel.aub" (relative path, mode "w+"), emits the version
 * header packet followed by a block of GTT entries, and caches the result in
 * device->aub_writer.
 *
 * Returns NULL when the effective and real user IDs differ, or when the
 * allocation or the fopen() fails.
 */
static struct anv_aub_writer *
get_anv_aub_writer(struct anv_device *device)
{
   const int gtt_bytes = 0x10000;
   struct anv_aub_writer *aub;

   /* Never produce a trace file when euid and uid disagree. */
   if (geteuid() != getuid())
      return NULL;

   /* Fast path: the writer was already set up by an earlier call. */
   aub = device->aub_writer;
   if (aub != NULL)
      return aub;

   aub = malloc(sizeof(*aub));
   if (aub == NULL)
      return NULL;

   aub->gen = device->info.gen;
   aub->file = fopen("intel.aub", "w+");
   if (aub->file == NULL) {
      free(aub);
      return NULL;
   }

   /* Objects get allocated starting right after the GTT. */
   aub->offset = gtt_bytes;

   /* The trace must begin with a version packet. */
   aub_out(aub, CMD_AUB_HEADER | (13 - 2));
   aub_out(aub,
           (4 << AUB_HEADER_MAJOR_SHIFT) |
           (0 << AUB_HEADER_MINOR_SHIFT));
   for (int word = 0; word < 8; word++)
      aub_out(aub, 0); /* app name */
   aub_out(aub, 0); /* timestamp */
   aub_out(aub, 0); /* timestamp */
   aub_out(aub, 0); /* comment len */

   /* Set up the GTT: a block header, then one 32-bit entry for every four
    * bytes of gtt_bytes.  The header is one dword longer on gen8+.
    */
   const int header_dwords = aub->gen >= 8 ? 6 : 5;

   aub_out(aub, CMD_AUB_TRACE_HEADER_BLOCK | (header_dwords - 2));
   aub_out(aub,
           AUB_TRACE_MEMTYPE_GTT_ENTRY |
           AUB_TRACE_TYPE_NOTYPE | AUB_TRACE_OP_DATA_WRITE);
   aub_out(aub, 0); /* subtype */
   aub_out(aub, 0); /* offset */
   aub_out(aub, gtt_bytes); /* size */
   if (aub->gen >= 8)
      aub_out(aub, 0);

   /* Entries start at 0x200003 and advance by 0x1000 each. */
   int gtt_entry = 0x200003;
   for (int written = 0; written < gtt_bytes; written += 4) {
      aub_out(aub, gtt_entry);
      gtt_entry += 0x1000;
   }

   device->aub_writer = aub;
   return aub;
}

/*
 * Close the trace file and release the writer.
 *
 * The writer is only created lazily on the first dump, so a device may never
 * have one.  Passing NULL is therefore accepted and does nothing, mirroring
 * free().
 */
void
anv_aub_writer_destroy(struct anv_aub_writer *writer)
{
   if (writer == NULL)
      return;

   fclose(writer->file);
   free(writer);
}


/**
 * Write `size` bytes of object contents to the trace at `gtt_offset`, split
 * into data-write blocks of at most 8 * 4096 bytes each.  Large objects have
 * to be broken up: a 128kb VBO would otherwise overflow the 16 bits of size
 * field in the packet header and everything goes badly after that.
 *
 * When `virtual` is NULL, zeroes are written in place of object data.  Every
 * block's payload is padded with zero bytes to a multiple of 4.
 */
static void
aub_write_trace_block(struct anv_aub_writer *writer, uint32_t type,
                      void *virtual, uint32_t size, uint32_t gtt_offset)
{
   enum { MAX_BLOCK_BYTES = 8 * 4096 };
   static const char zeroes[MAX_BLOCK_BYTES];
   const char *src = virtual;
   const int is_gen8_plus = writer->gen >= 8;
   uint32_t done = 0;

   while (done < size) {
      uint32_t chunk = size - done;

      if (chunk > MAX_BLOCK_BYTES)
         chunk = MAX_BLOCK_BYTES;

      /* Block header; gen8+ carries one extra trailing dword. */
      aub_out(writer,
              CMD_AUB_TRACE_HEADER_BLOCK |
              ((is_gen8_plus ? 6 : 5) - 2));
      aub_out(writer,
              AUB_TRACE_MEMTYPE_GTT |
              type | AUB_TRACE_OP_DATA_WRITE);
      aub_out(writer, 0); /* subtype */
      aub_out(writer, gtt_offset + done);
      aub_out(writer, ALIGN_U32(chunk, 4));
      if (is_gen8_plus)
         aub_out(writer, 0);

      /* Payload: the object's bytes, or zeroes when there is no mapping. */
      aub_out_data(writer, src ? src + done : zeroes, chunk);

      /* Round the payload up to a multiple of 4 bytes. */
      aub_out_data(writer, zeroes, -chunk & 3);

      done += chunk;
   }
}

/*
 * Make a ringbuffer on the fly and dump it.
 *
 * The ring contains a single AUB_MI_BATCH_BUFFER_START pointing at
 * `batch_offset` (three dwords on gen8+, two otherwise) and is emitted as a
 * command-write block at GTT address `offset`.  `ring_flag` picks the ring:
 * I915_EXEC_BSD selects PRB1, I915_EXEC_BLT selects PRB2, and anything else
 * falls back to PRB0.
 */
static void
aub_build_dump_ringbuffer(struct anv_aub_writer *writer,
                          uint32_t batch_offset, uint32_t offset,
                          int ring_flag)
{
   uint32_t ring_dwords[4096] = { 0 };
   const int is_gen8_plus = writer->gen >= 8;
   int ring_type;
   int len = 0;

   if (ring_flag == I915_EXEC_BSD)
      ring_type = AUB_TRACE_TYPE_RING_PRB1;
   else if (ring_flag == I915_EXEC_BLT)
      ring_type = AUB_TRACE_TYPE_RING_PRB2;
   else
      ring_type = AUB_TRACE_TYPE_RING_PRB0; /* The default ring */

   /* Fill in the commands that start our batchbuffer. */
   if (is_gen8_plus) {
      ring_dwords[len++] = AUB_MI_BATCH_BUFFER_START | (3 - 2);
      ring_dwords[len++] = batch_offset;
      ring_dwords[len++] = 0;
   } else {
      ring_dwords[len++] = AUB_MI_BATCH_BUFFER_START;
      ring_dwords[len++] = batch_offset;
   }

   /* Emit the ring.  Writing it appears to be what triggers execution in
    * the simulator.
    */
   aub_out(writer,
           CMD_AUB_TRACE_HEADER_BLOCK |
           ((is_gen8_plus ? 6 : 5) - 2));
   aub_out(writer,
           AUB_TRACE_MEMTYPE_GTT | ring_type | AUB_TRACE_OP_COMMAND_WRITE);
   aub_out(writer, 0); /* general/surface subtype */
   aub_out(writer, offset);
   aub_out(writer, len * 4);
   if (is_gen8_plus)
      aub_out(writer, 0);

   /* FIXME: Need some flush operations here? */
   aub_out_data(writer, ring_dwords, len * 4);
}

/* Per-BO bookkeeping used while dumping a single command buffer. */
struct aub_bo {
   /* GTT offset assigned to this BO within the trace. */
   uint32_t offset;
   /* CPU mapping of the BO contents: either the BO's existing map or a
    * temporary mapping created for the dump. */
   void *map;
   /* Data that actually gets dumped: `map` itself, or a malloc'ed copy of it
    * with relocations applied. */
   void *relocated;
};

static void
relocate_bo(struct anv_bo *bo, struct drm_i915_gem_relocation_entry *relocs,
            size_t num_relocs, struct aub_bo *bos)
{
   struct aub_bo *aub_bo = &bos[bo->index];
   struct drm_i915_gem_relocation_entry *reloc;
   uint32_t *dw;

   aub_bo->relocated = malloc(bo->size);
   memcpy(aub_bo->relocated, aub_bo->map, bo->size);
   for (size_t i = 0; i < num_relocs; i++) {
      reloc = &relocs[i];
      assert(reloc->offset < bo->size);
      dw = aub_bo->relocated + reloc->offset;
      *dw = bos[reloc->target_handle].offset + reloc->delta;
   }
}

/*
 * Dump a command buffer and every BO it references to the AUB trace.
 *
 * Steps:
 *   1. Assign each BO in exec2_bos a trace GTT offset, starting from
 *      writer->offset, and make sure it has a CPU mapping.
 *   2. Apply relocations to the batch BOs and the surface state BOs.
 *   3. Write every BO out; the last BO in exec2_bos must be the first batch
 *      BO and is written as AUB_TRACE_TYPE_BATCH using first_bbo->length.
 *   4. Emit a ring buffer that starts the first batch BO, then flush.
 *
 * The dump is silently skipped when no writer is available, when the command
 * buffer has no batch BO, or when the bookkeeping array cannot be allocated.
 *
 * NOTE(review): writer->offset is never advanced here, so successive dumps
 * place their BOs in the same GTT range -- confirm this is intended.
 */
void
anv_cmd_buffer_dump(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   struct anv_batch *batch = &cmd_buffer->batch;
   struct anv_aub_writer *writer;
   struct anv_bo *bo;
   uint32_t ring_flag = 0;
   uint32_t offset;
   struct aub_bo *aub_bos;

   writer = get_anv_aub_writer(device);
   if (writer == NULL)
      return;

   /* Without a batch BO there is nothing to execute, and first_bbo below
    * would never be assigned.
    */
   if (cmd_buffer->last_batch_bo == NULL)
      return;

   aub_bos = calloc(cmd_buffer->bo_count, sizeof(aub_bos[0]));
   if (aub_bos == NULL)
      return;

   offset = writer->offset;
   for (uint32_t i = 0; i < cmd_buffer->bo_count; i++) {
      bo = cmd_buffer->exec2_bos[i];
      if (bo->map)
         aub_bos[i].map = bo->map;
      else
         aub_bos[i].map = anv_gem_mmap(device, bo->gem_handle, 0, bo->size);
      /* Until relocate_bo() says otherwise, dump the raw mapping. */
      aub_bos[i].relocated = aub_bos[i].map;
      aub_bos[i].offset = offset;
      offset = ALIGN_U32(offset + bo->size + 4095, 4096);
   }

   struct anv_batch_bo *first_bbo = NULL;
   for (struct anv_batch_bo *bbo = cmd_buffer->last_batch_bo;
        bbo != NULL; bbo = bbo->prev_batch_bo) {
      /* Keep stashing the current BO until we get to the beginning */
      first_bbo = bbo;

      /* Handle relocations for this batch BO */
      relocate_bo(&bbo->bo, &batch->relocs.relocs[bbo->first_reloc],
                  bbo->num_relocs, aub_bos);
   }
   assert(first_bbo->prev_batch_bo == NULL);

   for (struct anv_batch_bo *bbo = cmd_buffer->surface_batch_bo;
        bbo != NULL; bbo = bbo->prev_batch_bo) {

      /* Handle relocations for this surface state BO */
      relocate_bo(&bbo->bo,
                  &cmd_buffer->surface_relocs.relocs[bbo->first_reloc],
                  bbo->num_relocs, aub_bos);
   }

   for (uint32_t i = 0; i < cmd_buffer->bo_count; i++) {
      bo = cmd_buffer->exec2_bos[i];
      if (i == cmd_buffer->bo_count - 1) {
         assert(bo == &first_bbo->bo);
         aub_write_trace_block(writer, AUB_TRACE_TYPE_BATCH,
                               aub_bos[i].relocated,
                               first_bbo->length, aub_bos[i].offset);
      } else {
         aub_write_trace_block(writer, AUB_TRACE_TYPE_NOTYPE,
                               aub_bos[i].relocated,
                               bo->size, aub_bos[i].offset);
      }
      /* Release the relocated copy and any mapping made just for this dump. */
      if (aub_bos[i].relocated != aub_bos[i].map)
         free(aub_bos[i].relocated);
      if (aub_bos[i].map != bo->map)
         anv_gem_munmap(aub_bos[i].map, bo->size);
   }

   /* Dump ring buffer */
   aub_build_dump_ringbuffer(writer, aub_bos[first_bbo->bo.index].offset,
                             offset, ring_flag);

   free(aub_bos);

   fflush(writer->file);
}