From 8dfc9f038ee3f6a57f0a3f3cc641b0866a6111b7 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 16 Oct 2013 11:51:22 -0700
Subject: i965/fs: Use the gen7 scratch read opcode when possible.

This avoids a lot of message setup we had to do otherwise.  Improves
GLB2.7 performance with register spilling force enabled by 1.6442% +/-
0.553218% (n=4).

v2: Use BRW_PREDICATE_NONE, improve a comment (by Paul).

Reviewed-by: Paul Berry <stereotype441@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp')

diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 5093dd5..9e4de29 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -342,6 +342,18 @@ schedule_node::set_latency_gen7(bool is_haswell)
       latency = 200;
       break;
 
+   case SHADER_OPCODE_GEN7_SCRATCH_READ:
+      /* Testing a load from offset 0, that had been previously written:
+       *
+       * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q };
+       * mov(8)  null      g114<8,8,1>F { align1 WE_normal 1Q };
+       *
+       * The cycles spent seemed to be grouped around 40-50 (as low as 38),
+       * then around 140.  Presumably this is cache hit vs miss.
+       */
+      latency = 50;
+      break;
+
    default:
       /* 2 cycles:
        * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
-- 
cgit v1.1