23 files changed, 662 insertions, 121 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 182b432..3847f91 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -88,6 +88,7 @@ i965_C_FILES = \
 	gen6_clip_state.c \
 	gen6_depthstencil.c \
 	gen6_gs_state.c \
+        gen6_multisample_state.c \
 	gen6_sampler_state.c \
 	gen6_scissor_state.c \
 	gen6_sf_state.c \
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.cpp b/src/mesa/drivers/dri/i965/brw_blorp.cpp
index 762d735..8e22511 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp.cpp
@@ -36,7 +36,8 @@ brw_blorp_mip_info::brw_blorp_mip_info()
 }
 
 brw_blorp_surface_info::brw_blorp_surface_info()
-   : map_stencil_as_y_tiled(false)
+   : map_stencil_as_y_tiled(false),
+     num_samples(0)
 {
 }
 
@@ -60,11 +61,15 @@ brw_blorp_surface_info::set(struct intel_mipmap_tree *mt,
    if (mt->format == MESA_FORMAT_S8) {
       /* The miptree is a W-tiled stencil buffer.  Surface states can't be set
        * up for W tiling, so we'll need to use Y tiling and have the WM
-       * program swizzle the coordinates.
+       * program swizzle the coordinates.  Furthermore, we need to set up the
+       * surface state as single-sampled, because the memory layout of related
+       * samples doesn't match between W and Y tiling.
        */
       this->map_stencil_as_y_tiled = true;
+      this->num_samples = 0;
    } else {
       this->map_stencil_as_y_tiled = false;
+      this->num_samples = mt->num_samples;
    }
 }
 
@@ -88,6 +93,7 @@ brw_blorp_params::brw_blorp_params()
      y1(0),
      depth_format(0),
      hiz_op(GEN6_HIZ_OP_NONE),
+     num_samples(0),
      use_wm_prog(false)
 {
 }
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h b/src/mesa/drivers/dri/i965/brw_blorp.h
index b6b659d..f14a5c7 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.h
+++ b/src/mesa/drivers/dri/i965/brw_blorp.h
@@ -97,6 +97,8 @@ public:
     * width and height of the buffer.
     */
    bool map_stencil_as_y_tiled;
+
+   unsigned num_samples;
 };
 
 
@@ -151,6 +153,7 @@ public:
    brw_blorp_surface_info src;
    brw_blorp_surface_info dst;
    enum gen6_hiz_op hiz_op;
+   unsigned num_samples;
    bool use_wm_prog;
    brw_blorp_wm_push_constants wm_push_consts;
 };
@@ -177,16 +180,39 @@ public:
 
 struct brw_blorp_blit_prog_key
 {
+   /* Number of samples per pixel that have been configured in the surface
+    * state for texturing from.
+    */
+   unsigned tex_samples;
+
+   /* Actual number of samples per pixel in the source image. */
+   unsigned src_samples;
+
+   /* Number of samples per pixel that have been configured in the render
+    * target.
+    */
+   unsigned rt_samples;
+
+   /* Actual number of samples per pixel in the destination image. */
+   unsigned dst_samples;
+
    /* True if the source image is W tiled.  If true, the surface state for the
-    * source image must be configured as Y tiled.
+    * source image must be configured as Y tiled, and tex_samples must be 0.
     */
    bool src_tiled_w;
 
    /* True if the destination image is W tiled.  If true, the surface state
-    * for the render target must be configured as Y tiled.
+    * for the render target must be configured as Y tiled, and rt_samples must
+    * be 0.
     */
    bool dst_tiled_w;
 
+   /* True if all source samples should be blended together to produce each
+    * destination pixel.  If true, src_tiled_w must be false, tex_samples must
+    * equal src_samples, and tex_samples must be nonzero.
+    */
+   bool blend;
+
    /* True if the rectangle being sent through the rendering pipeline might be
     * larger than the destination rectangle, so the WM program should kill any
     * pixels that are outside the destination rectangle.
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
index cce5d1b..1f0c318 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -215,11 +215,29 @@ brw_blorp_framebuffer(struct intel_context *intel,
  *
  * The bulk of the work done by the WM program is to wrap and unwrap the
  * coordinate transformations used by the hardware to store surfaces in
- * memory.  The hardware transforms a pixel location (X, Y) to a memory offset
- * by the following formulas:
+ * memory.  The hardware transforms a pixel location (X, Y, S) (where S is the
+ * sample index for a multisampled surface) to a memory offset by the
+ * following formulas:
  *
- *   offset = tile(tiling_format, X, Y)
- *   (X, Y) = detile(tiling_format, offset)
+ *   offset = tile(tiling_format, encode_msaa(num_samples, X, Y, S))
+ *   (X, Y, S) = decode_msaa(num_samples, detile(tiling_format, offset))
+ *
+ * For a single-sampled surface, encode_msaa() and decode_msaa() are
+ * identity functions:
+ *
+ *   encode_msaa(1, X, Y, 0) = (X, Y)
+ *   decode_msaa(1, X, Y) = (X, Y, 0)
+ *
+ * For a 4x multisampled surface, encode_msaa() embeds the sample number into
+ * bit 1 of the X and Y coordinates:
+ *
+ *   encode_msaa(4, X, Y, S) = (X', Y')
+ *     where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
+ *           Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1)
+ *   decode_msaa(4, X, Y) = (X', Y', S)
+ *     where X' = (X & ~0b11) >> 1 | (X & 0b1)
+ *           Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
+ *           S = (Y & 0b10) | (X & 0b10) >> 1
  *
  * For X tiling, tile() combines together the low-order bits of the X and Y
  * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
@@ -239,7 +257,7 @@ brw_blorp_framebuffer(struct intel_context *intel,
  *                | (A & 0b111111111)
  *
  * (In all tiling formulas, cpp is the number of bytes occupied by a single
- * pixel ("chars per pixel"), and tile_pitch is the number of 4k tiles
+ * sample ("chars per pixel"), and tile_pitch is the number of 4k tiles
  * required to fill the width of the surface).
  *
  * For Y tiling, tile() combines together the low-order bits of the X and Y
@@ -301,7 +319,7 @@ brw_blorp_framebuffer(struct intel_context *intel,
  *           X' = A % pitch
  *
  * (In these formulas, pitch is the number of bytes occupied by a single row
- * of pixels).
+ * of samples).
  */
 class brw_blorp_blit_program
 {
@@ -319,8 +337,12 @@ private:
    void alloc_push_const_regs(int base_reg);
    void compute_frag_coords();
    void translate_tiling(bool old_tiled_w, bool new_tiled_w);
+   void encode_msaa(unsigned num_samples);
+   void decode_msaa(unsigned num_samples);
    void kill_if_outside_dst_rect();
    void translate_dst_to_src();
+   void single_to_blend();
+   void sample();
    void texel_fetch();
    void texture_lookup(GLuint msg_type,
                        struct brw_reg mrf_u, struct brw_reg mrf_v);
@@ -364,6 +386,14 @@ private:
     */
    int xy_coord_index;
 
+   /* True if, at the point in the program currently being compiled, the
+    * sample index is known to be zero.
+    */
+   bool s_is_zero;
+
+   /* Register storing the sample index when s_is_zero is false. */
+   struct brw_reg sample_index;
+
    /* Temporaries */
    struct brw_reg t1;
    struct brw_reg t2;
@@ -395,6 +425,37 @@ const GLuint *
 brw_blorp_blit_program::compile(struct brw_context *brw,
                                 GLuint *program_size)
 {
+   /* Sanity checks */
+   if (key->src_tiled_w) {
+      /* If the source image is W tiled, then tex_samples must be 0.
+       * Otherwise, after conversion between W and Y tiling, there's no
+       * guarantee that the sample index will be 0.
+       */
+      assert(key->tex_samples == 0);
+   }
+
+   if (key->dst_tiled_w) {
+      /* If the destination image is W tiled, then rt_samples must be 0.
+       * Otherwise, after conversion between W and Y tiling, there's no
+       * guarantee that all samples corresponding to a single pixel will still
+       * be together.
+       */
+      assert(key->rt_samples == 0);
+   }
+
+   if (key->blend) {
+      /* We are blending, which means we'll be using a SAMPLE message, which
+       * causes the hardware to pick up all of the samples corresponding
+       * to this pixel and average them together.  Since we'll be relying on
+       * the hardware to find all of the samples and combine them together,
+       * the surface state for the texture must be configured with the correct
+       * tiling and sample count.
+       */
+      assert(!key->src_tiled_w);
+      assert(key->tex_samples == key->src_samples);
+      assert(key->tex_samples > 0);
+   }
+
    brw_set_compression_control(&func, BRW_COMPRESSION_NONE);
 
    alloc_regs();
@@ -405,22 +466,29 @@ brw_blorp_blit_program::compile(struct brw_context *brw,
    const bool tex_tiled_w = false;
 
    /* The address that data will be written to is determined by the
-    * coordinates supplied to the WM thread and the tiling of the render
-    * target, according to the formula:
+    * coordinates supplied to the WM thread and the tiling and sample count of
+    * the render target, according to the formula:
     *
-    * (X, Y) = detile(rt_tiling, offset)
+    * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset))
     *
-    * If the actual tiling of the destination surface is not the same as the
-    * configuration of the render target, then these coordinates are wrong and
-    * we have to adjust them to compensate for the difference.
+    * If the actual tiling and sample count of the destination surface are not
+    * the same as the configuration of the render target, then these
+    * coordinates are wrong and we have to adjust them to compensate for the
+    * difference.
     */
-   if (rt_tiled_w != key->dst_tiled_w)
+   if (rt_tiled_w != key->dst_tiled_w ||
+       key->rt_samples != key->dst_samples) {
+      encode_msaa(key->rt_samples);
+      /* Now (X, Y) = detile(rt_tiling, offset) */
       translate_tiling(rt_tiled_w, key->dst_tiled_w);
+      /* Now (X, Y) = detile(dst_tiling, offset) */
+      decode_msaa(key->dst_samples);
+   }
 
-   /* Now (X, Y) = detile(dst_tiling, offset).
+   /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
     *
-    * That is: X and Y now contain the true coordinates of the data that the
-    * WM thread should output.
+    * That is: X, Y and S now contain the true coordinates and sample index of
+    * the data that the WM thread should output.
     *
     * If we need to kill pixels that are outside the destination rectangle,
     * now is the time to do it.
@@ -432,31 +500,50 @@ brw_blorp_blit_program::compile(struct brw_context *brw,
    /* Next, apply a translation to obtain coordinates in the source image. */
    translate_dst_to_src();
 
-   /* X and Y are now the coordinates of the pixel in the source image that we
-    * want to texture from.
-    *
-    * The address that we want to fetch from is
-    * related to the X and Y values according to the formula:
-    *
-    * (X, Y) = detile(src_tiling, offset).
-    *
-    * If the actual tiling of the source surface is not the same as the
-    * configuration of the texture, then we need to adjust the coordinates to
-    * compensate for the difference.
+   /* If the source image is not multisampled, then we want to fetch sample
+    * number 0, because that's the only sample there is.
     */
-   if (tex_tiled_w != key->src_tiled_w)
-      translate_tiling(key->src_tiled_w, tex_tiled_w);
+   if (key->src_samples == 0)
+      s_is_zero = true;
 
-   /* Now (X, Y) = detile(tex_tiling, offset).
-    *
-    * In other words: X and Y now contain values which, when passed to
-    * the texturing unit, will cause data to be read from the correct
-    * memory location.  So we can fetch the texel now.
+   /* X, Y, and S are now the coordinates of the pixel in the source image
+    * that we want to texture from.  Exception: if we are blending, then S is
+    * irrelevant, because we are going to fetch all samples.
     */
-   texel_fetch();
+   if (key->blend) {
+      single_to_blend();
+      sample();
+   } else {
+      /* We aren't blending, which means we just want to fetch a single sample
+       * from the source surface.  The address that we want to fetch from is
+       * related to the X, Y and S values according to the formula:
+       *
+       * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
+       *
+       * If the actual tiling and sample count of the source surface are not
+       * the same as the configuration of the texture, then we need to adjust
+       * the coordinates to compensate for the difference.
+       */
+      if (tex_tiled_w != key->src_tiled_w ||
+          key->tex_samples != key->src_samples) {
+         encode_msaa(key->src_samples);
+         /* Now (X, Y) = detile(src_tiling, offset) */
+         translate_tiling(key->src_tiled_w, tex_tiled_w);
+         /* Now (X, Y) = detile(tex_tiling, offset) */
+         decode_msaa(key->tex_samples);
+      }
 
-   /* Finally, write the fetched value to the render target and terminate the
-    * thread.
+      /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
+       *
+       * In other words: X, Y, and S now contain values which, when passed to
+       * the texturing unit, will cause data to be read from the correct
+       * memory location.  So we can fetch the texel now.
+       */
+      texel_fetch();
+   }
+
+   /* Finally, write the fetched (or blended) value to the render target and
+    * terminate the thread.
     */
    render_target_write();
    return brw_get_program(&func, program_size);
@@ -499,6 +586,8 @@ brw_blorp_blit_program::alloc_regs()
          = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
    }
    this->xy_coord_index = 0;
+   this->sample_index
+      = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
    this->t1 = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
    this->t2 = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
 
@@ -511,11 +600,14 @@ brw_blorp_blit_program::alloc_regs()
 /* In the code that follows, X and Y can be used to quickly refer to the
  * active elements of x_coords and y_coords, and Xp and Yp ("X prime" and "Y
  * prime") to the inactive elements.
+ *
+ * S can be used to quickly refer to sample_index.
  */
 #define X x_coords[xy_coord_index]
 #define Y y_coords[xy_coord_index]
 #define Xp x_coords[!xy_coord_index]
 #define Yp y_coords[!xy_coord_index]
+#define S sample_index
 
 /* Quickly swap the roles of (X, Y) and (Xp, Yp).  Saves us from having to do
  * MOVs to transfor (Xp, Yp) to (X, Y) after a coordinate transformation.
@@ -564,6 +656,12 @@ brw_blorp_blit_program::compute_frag_coords()
     * pixels n+2 and n+3 are in the bottom half of the subspan.
     */
    brw_ADD(&func, Y, stride(suboffset(R1, 5), 2, 4, 0), brw_imm_v(0x11001100));
+
+   /* Since we always run the WM in a mode that causes a single fragment
+    * dispatch per pixel, it's not meaningful to compute a sample value.  Just
+    * set it to 0.
+    */
+   s_is_zero = true;
 }
 
 /**
@@ -656,6 +754,86 @@ brw_blorp_blit_program::translate_tiling(bool old_tiled_w, bool new_tiled_w)
 }
 
 /**
+ * Emit code to compensate for the difference between MSAA and non-MSAA
+ * surfaces.
+ *
+ * This code modifies the X and Y coordinates according to the formula:
+ *
+ *   (X', Y') = encode_msaa(num_samples, X, Y, S)
+ *
+ * (See brw_blorp_blit_program).
+ */
+void
+brw_blorp_blit_program::encode_msaa(unsigned num_samples)
+{
+   if (num_samples == 0) {
+      /* Single-sampled: (X, Y) is left as-is and no instructions are emitted. */
+   } else {
+      /* Every nonzero sample count is encoded with the 4x layout:
+       *   X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
+       *   Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1)
+       * The S terms are skipped entirely when s_is_zero is set. */
+      brw_AND(&func, t1, X, brw_imm_uw(0xfffe)); /* X & ~0b1 */
+      if (!s_is_zero) {
+         brw_AND(&func, t2, S, brw_imm_uw(1)); /* S & 0b1 */
+         brw_OR(&func, t1, t1, t2); /* (X & ~0b1) | (S & 0b1) */
+      }
+      brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b1) << 1
+                                                | (S & 0b1) << 1 */
+      brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
+      brw_OR(&func, Xp, t1, t2); /* X' */
+      brw_AND(&func, t1, Y, brw_imm_uw(0xfffe)); /* Y & ~0b1 */
+      brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */
+      if (!s_is_zero) {
+         brw_AND(&func, t2, S, brw_imm_uw(2)); /* S & 0b10 */
+         brw_OR(&func, t1, t1, t2); /* (Y & ~0b1) << 1 | (S & 0b10) */
+      }
+      brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
+      brw_OR(&func, Yp, t1, t2); /* Y' */
+      SWAP_XY_AND_XPYP(); /* (Xp, Yp) now take over the role of (X, Y) */
+   }
+}
+
+/**
+ * Emit code to compensate for the difference between MSAA and non-MSAA
+ * surfaces.
+ *
+ * This code modifies the X and Y coordinates according to the formula:
+ *
+ *   (X', Y', S) = decode_msaa(num_samples, X, Y)
+ *
+ * (See brw_blorp_blit_program).
+ */
+void
+brw_blorp_blit_program::decode_msaa(unsigned num_samples)
+{
+   if (num_samples == 0) {
+      /* Single-sampled: (X, Y) is left as-is; only record that S is zero. */
+      s_is_zero = true;
+   } else {
+      /* Every nonzero sample count is decoded with the 4x layout:
+       *   X' = (X & ~0b11) >> 1 | (X & 0b1)
+       *   Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
+       *   S = (Y & 0b10) | (X & 0b10) >> 1
+       */
+      brw_AND(&func, t1, X, brw_imm_uw(0xfffc)); /* X & ~0b11 */
+      brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b11) >> 1 */
+      brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
+      brw_OR(&func, Xp, t1, t2); /* X' */
+      brw_AND(&func, t1, Y, brw_imm_uw(0xfffc)); /* Y & ~0b11 */
+      brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */
+      brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
+      brw_OR(&func, Yp, t1, t2); /* Y' */
+      brw_AND(&func, t1, Y, brw_imm_uw(2)); /* Y & 0b10 */
+      brw_AND(&func, t2, X, brw_imm_uw(2)); /* X & 0b10 */
+      brw_SHR(&func, t2, t2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */
+      brw_OR(&func, S, t1, t2); /* S, read from the pre-swap X and Y */
+      s_is_zero = false; /* S now holds a computed value */
+      SWAP_XY_AND_XPYP(); /* (Xp, Yp) now take over the role of (X, Y) */
+   }
+}
+
+/**
  * Emit code that kills pixels whose X and Y coordinates are outside the
  * boundary of the rectangle defined by the push constants (dst_x0, dst_y0,
  * dst_x1, dst_y1).
@@ -694,12 +872,43 @@ brw_blorp_blit_program::translate_dst_to_src()
 }
 
 /**
+ * Emit code to transform the X and Y coordinates as needed for blending
+ * together the different samples in an MSAA texture.
+ */
+void
+brw_blorp_blit_program::single_to_blend()
+{
+   /* When looking up samples in an MSAA texture using the SAMPLE message,
+    * Gen6 requires the texture coordinates to be odd integers (so that they
+    * correspond to the center of a 2x2 block representing the four samples
+    * that make up a pixel).  So we need to multiply our X and Y coordinates
+    * each by 2 and then add 1.
+    */
+   brw_SHL(&func, t1, X, brw_imm_w(1)); /* 2 * X */
+   brw_SHL(&func, t2, Y, brw_imm_w(1)); /* 2 * Y */
+   brw_ADD(&func, Xp, t1, brw_imm_w(1)); /* 2 * X + 1 */
+   brw_ADD(&func, Yp, t2, brw_imm_w(1)); /* 2 * Y + 1 */
+   SWAP_XY_AND_XPYP(); /* (Xp, Yp) now take over the role of (X, Y) */
+}
+
+/**
+ * Emit a texture lookup using the SAMPLE message (which blends the samples of
+ * an MSAA surface), passing mrf_u_float and mrf_v_float without retyping.
+ */
+void
+brw_blorp_blit_program::sample()
+{
+   texture_lookup(GEN5_SAMPLER_MESSAGE_SAMPLE, mrf_u_float, mrf_v_float);
+}
+
+/**
  * Emit code to look up a value in the texture using the SAMPLE_LD message
  * (which does a simple texel fetch).
  */
 void
 brw_blorp_blit_program::texel_fetch()
 {
+   assert(s_is_zero);
    texture_lookup(GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                   retype(mrf_u_float, BRW_REGISTER_TYPE_UD),
                   retype(mrf_v_float, BRW_REGISTER_TYPE_UD));
@@ -816,6 +1025,39 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct intel_mipmap_tree *src_mt,
    use_wm_prog = true;
    memset(&wm_prog_key, 0, sizeof(wm_prog_key));
 
+   if (src_mt->num_samples > 0 && dst_mt->num_samples > 0) {
+      /* We are blitting from a multisample buffer to a multisample buffer, so
+       * we must preserve samples within a pixel.  This means we have to
+       * configure the render target and texture surface states as
+       * single-sampled, so that the WM program can access each sample
+       * individually.
+       */
+      src.num_samples = dst.num_samples = 0;
+   }
+
+   /* The render path must be configured to use the same number of samples as
+    * the destination buffer.
+    */
+   num_samples = dst.num_samples;
+
+   GLenum base_format = _mesa_get_format_base_format(src_mt->format);
+   if (base_format != GL_DEPTH_COMPONENT && /* TODO: what about depth/stencil? */
+       base_format != GL_STENCIL_INDEX &&
+       src_mt->num_samples > 0 && dst_mt->num_samples == 0) {
+      /* We are downsampling a color buffer, so blend. */
+      wm_prog_key.blend = true;
+   }
+
+   /* src_samples and dst_samples are the true sample counts */
+   wm_prog_key.src_samples = src_mt->num_samples;
+   wm_prog_key.dst_samples = dst_mt->num_samples;
+
+   /* tex_samples and rt_samples are the sample counts that are set up in
+    * SURFACE_STATE.
+    */
+   wm_prog_key.tex_samples = src.num_samples;
+   wm_prog_key.rt_samples  = dst.num_samples;
+
    wm_prog_key.src_tiled_w = src.map_stencil_as_y_tiled;
    wm_prog_key.dst_tiled_w = dst.map_stencil_as_y_tiled;
    x0 = wm_push_consts.dst_x0 = dst_x0;
@@ -825,6 +1067,22 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct intel_mipmap_tree *src_mt,
    wm_push_consts.x_transform.setup(src_x0, dst_x0, dst_x1, mirror_x);
    wm_push_consts.y_transform.setup(src_y0, dst_y0, dst_y1, mirror_y);
 
+   if (dst.num_samples == 0 && dst_mt->num_samples > 0) {
+      /* We must expand the rectangle we send through the rendering pipeline,
+       * to account for the fact that we are mapping the destination region as
+       * single-sampled when it is in fact multisampled.  We must also align
+       * it to a multiple of the multisampling pattern, because the
+       * differences between multisampled and single-sampled surface formats
+       * will mean that pixels are scrambled within the multisampling pattern.
+       * TODO: what if this makes the coordinates too large?
+       */
+      x0 = (x0 * 2) & ~3;
+      y0 = (y0 * 2) & ~3;
+      x1 = ALIGN(x1 * 2, 4);
+      y1 = ALIGN(y1 * 2, 4);
+      wm_prog_key.use_kill = true;
+   }
+
    if (dst.map_stencil_as_y_tiled) {
       /* We must modify the rectangle we send through the rendering pipeline,
        * to account for the fact that we are mapping it as Y-tiled when it is
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 8ffd208..a768416 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1099,6 +1099,14 @@ brw_blorp_framebuffer(struct intel_context *intel,
                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
                       GLbitfield mask, GLenum filter);
 
+/* gen6_multisample_state.c */
+void
+gen6_emit_3dstate_multisample(struct brw_context *brw,
+                              unsigned num_samples);
+void
+gen6_emit_3dstate_sample_mask(struct brw_context *brw,
+                              unsigned num_samples);
+
 
 
 /*======================================================================
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 01bad5c..aaab5a2 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -456,6 +456,11 @@
 /* Surface state DW4 */
 #define BRW_SURFACE_MIN_LOD_SHIFT	28
 #define BRW_SURFACE_MIN_LOD_MASK	INTEL_MASK(31, 28)
+#define BRW_SURFACE_MULTISAMPLECOUNT_1  (0 << 4)
+#define BRW_SURFACE_MULTISAMPLECOUNT_4  (2 << 4)
+#define GEN7_SURFACE_MULTISAMPLECOUNT_1 0
+#define GEN7_SURFACE_MULTISAMPLECOUNT_4 2
+#define GEN7_SURFACE_MULTISAMPLECOUNT_8 3
 
 /* Surface state DW5 */
 #define BRW_SURFACE_X_OFFSET_SHIFT		25
@@ -1305,6 +1310,7 @@ enum brw_wm_barycentric_interp_mode {
 # define GEN6_WM_MSRAST_OFF_PATTERN			(1 << 1)
 # define GEN6_WM_MSRAST_ON_PIXEL			(2 << 1)
 # define GEN6_WM_MSRAST_ON_PATTERN			(3 << 1)
+# define GEN6_WM_MSDISPMODE_PERSAMPLE			(0 << 0)
 # define GEN6_WM_MSDISPMODE_PERPIXEL			(1 << 0)
 /* DW7: kernel 1 pointer */
 /* DW8: kernel 2 pointer */
@@ -1388,6 +1394,7 @@ enum brw_wm_barycentric_interp_mode {
 # define GEN7_WM_MSRAST_ON_PIXEL			(2 << 0)
 # define GEN7_WM_MSRAST_ON_PATTERN			(3 << 0)
 /* DW2 */
+# define GEN7_WM_MSDISPMODE_PERSAMPLE			(0 << 31)
 # define GEN7_WM_MSDISPMODE_PERPIXEL			(1 << 31)
 
 #define _3DSTATE_PS				0x7820 /* GEN7+ */
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 0c0389f..b00278a 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -782,33 +782,16 @@ static void upload_invariant_state( struct brw_context *brw )
       ADVANCE_BATCH();
    }
 
-   if (intel->gen >= 6) {
+   if (intel->gen == 6) {
       int i;
-      int len = intel->gen >= 7 ? 4 : 3;
-
-      BEGIN_BATCH(len);
-      OUT_BATCH(_3DSTATE_MULTISAMPLE << 16 | (len - 2));
-      OUT_BATCH(MS_PIXEL_LOCATION_CENTER |
-		MS_NUMSAMPLES_1);
-      OUT_BATCH(0); /* positions for 4/8-sample */
-      if (intel->gen >= 7)
-	 OUT_BATCH(0);
-      ADVANCE_BATCH();
 
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_SAMPLE_MASK << 16 | (2 - 2));
-      OUT_BATCH(1);
-      ADVANCE_BATCH();
-
-      if (intel->gen < 7) {
-	 for (i = 0; i < 4; i++) {
-	    BEGIN_BATCH(4);
-	    OUT_BATCH(_3DSTATE_GS_SVB_INDEX << 16 | (4 - 2));
-	    OUT_BATCH(i << SVB_INDEX_SHIFT);
-	    OUT_BATCH(0);
-	    OUT_BATCH(0xffffffff);
-	    ADVANCE_BATCH();
-	 }
+      for (i = 0; i < 4; i++) {
+         BEGIN_BATCH(4);
+         OUT_BATCH(_3DSTATE_GS_SVB_INDEX << 16 | (4 - 2));
+         OUT_BATCH(i << SVB_INDEX_SHIFT);
+         OUT_BATCH(0);
+         OUT_BATCH(0xffffffff);
+         ADVANCE_BATCH();
       }
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 9e37361..89d0963 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -95,6 +95,7 @@ extern const struct brw_tracked_state gen6_color_calc_state;
 extern const struct brw_tracked_state gen6_depth_stencil_state;
 extern const struct brw_tracked_state gen6_gs_state;
 extern const struct brw_tracked_state gen6_gs_binding_table;
+extern const struct brw_tracked_state gen6_multisample_state;
 extern const struct brw_tracked_state gen6_renderbuffer_surfaces;
 extern const struct brw_tracked_state gen6_sampler_state;
 extern const struct brw_tracked_state gen6_scissor_state;
@@ -181,6 +182,7 @@ void *brw_state_batch(struct brw_context *brw,
 /* brw_wm_surface_state.c */
 void gen4_init_vtable_surface_functions(struct brw_context *brw);
 uint32_t brw_get_surface_tiling_bits(uint32_t tiling);
+uint32_t brw_get_surface_num_multisamples(unsigned num_samples);
 void brw_create_constant_surface(struct brw_context *brw,
 				 drm_intel_bo *bo,
 				 int width,
@@ -197,6 +199,8 @@ GLuint translate_tex_format(gl_format mesa_format,
 
 /* gen7_wm_surface_state.c */
 void gen7_set_surface_tiling(struct gen7_surface_state *surf, uint32_t tiling);
+void gen7_set_surface_num_multisamples(struct gen7_surface_state *surf,
+                                       unsigned num_samples);
 void gen7_init_vtable_surface_functions(struct brw_context *brw);
 void gen7_create_constant_surface(struct brw_context *brw,
 				  drm_intel_bo *bo,
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index b02e160..551fa6a 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -153,6 +153,7 @@ static const struct brw_tracked_state *gen6_atoms[] =
 
    &brw_samplers,
    &gen6_sampler_state,
+   &gen6_multisample_state, /* TODO: is this the right spot? */
 
    &gen6_vs_state,
    &gen6_gs_state,
@@ -221,6 +222,7 @@ const struct brw_tracked_state *gen7_atoms[] =
    &brw_wm_binding_table,
 
    &gen7_samplers,
+   &gen6_multisample_state, /* TODO: is this the right spot? */
 
    &gen7_disable_stages,
    &gen7_vs_state,
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 104d475..849da85 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -633,6 +633,17 @@ brw_get_surface_tiling_bits(uint32_t tiling)
    }
 }
 
+
+uint32_t
+brw_get_surface_num_multisamples(unsigned num_samples)
+{
+   if (num_samples > 0)
+      return BRW_SURFACE_MULTISAMPLECOUNT_4; /* any nonzero count maps to 4x */
+   else
+      return BRW_SURFACE_MULTISAMPLECOUNT_1; /* 0 means single-sampled */
+}
+
+
 static void
 brw_update_texture_surface( struct gl_context *ctx, GLuint unit )
 {
@@ -943,7 +954,8 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 				       intel_image->base.Base.Level,
 				       intel_image->base.Base.Level,
 				       width, height, depth,
-				       true);
+				       true,
+                                       0 /* num_samples */);
 
 	 intel_miptree_copy_teximage(intel, intel_image, new_mt);
 	 intel_miptree_reference(&irb->mt, intel_image->mt);
@@ -993,7 +1005,7 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
    surf[3] = (brw_get_surface_tiling_bits(region->tiling) |
 	      ((region->pitch * region->cpp) - 1) << BRW_SURFACE_PITCH_SHIFT);
 
-   surf[4] = 0;
+   surf[4] = brw_get_surface_num_multisamples(mt->num_samples);
 
    assert(brw->has_surface_tile_offset || (tile_x == 0 && tile_y == 0));
    /* Note that the low bits of these fields are missing, so
diff --git a/src/mesa/drivers/dri/i965/gen6_blorp.cpp b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
index 00aeda6..6db8f40 100644
--- a/src/mesa/drivers/dri/i965/gen6_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
@@ -100,28 +100,8 @@ gen6_blorp_emit_batch_head(struct brw_context *brw,
       ADVANCE_BATCH();
    }
 
-   /* 3DSTATE_MULTISAMPLE */
-   {
-      int length = intel->gen == 7 ? 4 : 3;
-
-      BEGIN_BATCH(length);
-      OUT_BATCH(_3DSTATE_MULTISAMPLE << 16 | (length - 2));
-      OUT_BATCH(MS_PIXEL_LOCATION_CENTER |
-                MS_NUMSAMPLES_1);
-      OUT_BATCH(0);
-      if (length >= 4)
-         OUT_BATCH(0);
-      ADVANCE_BATCH();
-
-   }
-
-   /* 3DSTATE_SAMPLE_MASK */
-   {
-      BEGIN_BATCH(2);
-      OUT_BATCH(_3DSTATE_SAMPLE_MASK << 16 | (2 - 2));
-      OUT_BATCH(1);
-      ADVANCE_BATCH();
-   }
+   gen6_emit_3dstate_multisample(brw, params->num_samples);
+   gen6_emit_3dstate_sample_mask(brw, params->num_samples);
 
    /* CMD_STATE_BASE_ADDRESS
     *
@@ -426,6 +406,10 @@ gen6_blorp_emit_surface_state(struct brw_context *brw,
    uint32_t wm_surf_offset;
    uint32_t width, height;
    surface->get_miplevel_dims(&width, &height);
+   if (surface->num_samples > 0) { /* TODO: seems clumsy */
+      width /= 2;
+      height /= 2;
+   }
    if (surface->map_stencil_as_y_tiled) {
       width *= 2;
       height /= 2;
@@ -462,7 +446,7 @@ gen6_blorp_emit_surface_state(struct brw_context *brw,
               0 << BRW_SURFACE_DEPTH_SHIFT |
               (pitch_bytes - 1) << BRW_SURFACE_PITCH_SHIFT);
 
-   surf[4] = 0;
+   surf[4] = brw_get_surface_num_multisamples(surface->num_samples);
 
    surf[5] = (0 << BRW_SURFACE_X_OFFSET_SHIFT |
               0 << BRW_SURFACE_Y_OFFSET_SHIFT |
@@ -695,7 +679,9 @@ gen6_blorp_emit_sf_config(struct brw_context *brw,
    OUT_BATCH((1 - 1) << GEN6_SF_NUM_OUTPUTS_SHIFT | /* only position */
              1 << GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT |
              0 << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT);
-   for (int i = 0; i < 18; ++i)
+   OUT_BATCH(0); /* dw2 */
+   OUT_BATCH(params->num_samples > 0 ? GEN6_SF_MSRAST_ON_PATTERN : 0);
+   for (int i = 0; i < 16; ++i)
       OUT_BATCH(0);
    ADVANCE_BATCH();
 }
@@ -754,6 +740,14 @@ gen6_blorp_emit_wm_config(struct brw_context *brw,
       dw5 |= GEN6_WM_DISPATCH_ENABLE; /* We are rendering */
    }
 
+   if (params->num_samples > 0) {
+      dw6 |= GEN6_WM_MSRAST_ON_PATTERN;
+      dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL;
+   } else {
+      dw6 |= GEN6_WM_MSRAST_OFF_PIXEL;
+      dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE;
+   }
+
    BEGIN_BATCH(9);
    OUT_BATCH(_3DSTATE_WM << 16 | (9 - 2));
    OUT_BATCH(params->use_wm_prog ? prog_offset : 0);
@@ -761,7 +755,7 @@ gen6_blorp_emit_wm_config(struct brw_context *brw,
    OUT_BATCH(0); /* No scratch needed */
    OUT_BATCH(dw4);
    OUT_BATCH(dw5);
-   OUT_BATCH(dw6); /* only position */
+   OUT_BATCH(dw6);
    OUT_BATCH(0); /* No other programs */
    OUT_BATCH(0); /* No other programs */
    ADVANCE_BATCH();
diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
new file mode 100644
index 0000000..e01ead1
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "intel_batchbuffer.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+
+
+/**
+ * Emit 3DSTATE_MULTISAMPLE: one sample when num_samples == 0, else 4x.
+ */
+void
+gen6_emit_3dstate_multisample(struct brw_context *brw,
+                              unsigned num_samples)
+{
+   struct intel_context *intel = &brw->intel;
+   const bool msaa = num_samples > 0;
+   const bool gen7_plus = intel->gen >= 7;
+   const int len = gen7_plus ? 4 : 3;
+
+   /* TODO: MSAA only implemented on Gen6 */
+   assert(intel->gen == 6 || !msaa);
+
+   BEGIN_BATCH(len);
+   OUT_BATCH(_3DSTATE_MULTISAMPLE << 16 | (len - 2));
+   OUT_BATCH(MS_PIXEL_LOCATION_CENTER |
+             (msaa ? MS_NUMSAMPLES_4 : MS_NUMSAMPLES_1));
+   OUT_BATCH(msaa ? 0xae2ae662 : 0); /* positions for 4/8-sample */
+   if (gen7_plus)
+      OUT_BATCH(0);
+   ADVANCE_BATCH();
+}
+
+
+/**
+ * Emit 3DSTATE_SAMPLE_MASK: 0xf when multisampled, 0x1 otherwise.
+ */
+void
+gen6_emit_3dstate_sample_mask(struct brw_context *brw,
+                              unsigned num_samples)
+{
+   struct intel_context *intel = &brw->intel;
+   /* Low four bits set when multisampled, only bit 0 otherwise. */
+   const unsigned sample_mask = num_samples > 0 ? 15 : 1;
+
+   /* TODO: MSAA only implemented on Gen6 */
+   assert(intel->gen == 6 || num_samples == 0);
+
+   BEGIN_BATCH(2);
+   OUT_BATCH(_3DSTATE_SAMPLE_MASK << 16 | (2 - 2));
+   OUT_BATCH(sample_mask);
+   ADVANCE_BATCH();
+}
+
+
+static void upload_multisample_state(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &intel->ctx;
+
+   /* _NEW_BUFFERS: take the sample count from the first color draw
+    * buffer; with none bound, program single-sampled state.
+    */
+   const unsigned num_samples = ctx->DrawBuffer->_ColorDrawBuffers[0] ?
+      ctx->DrawBuffer->_ColorDrawBuffers[0]->NumSamples : 0;
+
+   /* 3DSTATE_MULTISAMPLE is nonpipelined. */
+   intel_emit_post_sync_nonzero_flush(intel);
+   gen6_emit_3dstate_multisample(brw, num_samples);
+   gen6_emit_3dstate_sample_mask(brw, num_samples);
+}
+
+
+const struct brw_tracked_state gen6_multisample_state = {
+   .dirty = {
+      .mesa = _NEW_BUFFERS, /* upload_multisample_state reads ctx->DrawBuffer */
+      .brw = BRW_NEW_CONTEXT,
+      .cache = 0
+   },
+   .emit = upload_multisample_state
+};
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 5c4293c..e0aaa90 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -122,6 +122,10 @@ upload_sf_state(struct brw_context *brw)
    int i;
    /* _NEW_BUFFER */
    bool render_to_fbo = _mesa_is_user_fbo(brw->intel.ctx.DrawBuffer);
+   bool multisampled = false;
+   if (ctx->DrawBuffer->_ColorDrawBuffers[0])
+      multisampled = ctx->DrawBuffer->_ColorDrawBuffers[0]->NumSamples > 0;
+
    int attr = 0, input_index = 0;
    int urb_entry_read_offset = 1;
    float point_size;
@@ -226,13 +230,20 @@ upload_sf_state(struct brw_context *brw)
    }
 
    /* _NEW_LINE */
-   dw3 |= U_FIXED(CLAMP(ctx->Line.Width, 0.0, 7.99), 7) <<
-      GEN6_SF_LINE_WIDTH_SHIFT;
+   {
+      uint32_t line_width_u3_7 = U_FIXED(CLAMP(ctx->Line.Width, 0.0, 7.99), 7);
+      /* TODO: line width of 0 is not allowed when MSAA enabled */
+      if (line_width_u3_7 == 0)
+         line_width_u3_7 = 1;
+      dw3 |= line_width_u3_7 << GEN6_SF_LINE_WIDTH_SHIFT;
+   }
    if (ctx->Line.SmoothFlag) {
       dw3 |= GEN6_SF_LINE_AA_ENABLE;
       dw3 |= GEN6_SF_LINE_AA_MODE_TRUE;
       dw3 |= GEN6_SF_LINE_END_CAP_WIDTH_1_0;
    }
+   if (multisampled)
+      dw3 |= GEN6_SF_MSRAST_ON_PATTERN;
 
    /* _NEW_PROGRAM | _NEW_POINT */
    if (!(ctx->VertexProgram.PointSizeEnabled ||
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index fd1eca4..28b3c29 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -98,6 +98,11 @@ upload_wm_state(struct brw_context *brw)
    const struct brw_fragment_program *fp =
       brw_fragment_program_const(brw->fragment_program);
    uint32_t dw2, dw4, dw5, dw6;
+   bool multisampled = false;
+
+   /* _NEW_BUFFERS */
+   if (ctx->DrawBuffer->_ColorDrawBuffers[0])
+      multisampled = ctx->DrawBuffer->_ColorDrawBuffers[0]->NumSamples > 0;
 
     /* CACHE_NEW_WM_PROG */
    if (brw->wm.prog_data->nr_params == 0) {
@@ -185,6 +190,13 @@ upload_wm_state(struct brw_context *brw)
 
    dw6 |= _mesa_bitcount_64(brw->fragment_program->Base.InputsRead) <<
       GEN6_WM_NUM_SF_OUTPUTS_SHIFT;
+   if (multisampled) {
+      dw6 |= GEN6_WM_MSRAST_ON_PATTERN;
+      dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL;
+   } else {
+      dw6 |= GEN6_WM_MSRAST_OFF_PIXEL;
+      dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE;
+   }
 
    BEGIN_BATCH(9);
    OUT_BATCH(_3DSTATE_WM << 16 | (9 - 2));
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index f10d0aa..fbb94df 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -143,6 +143,10 @@ gen7_blorp_emit_surface_state(struct brw_context *brw,
    uint32_t wm_surf_offset;
    uint32_t width, height;
    surface->get_miplevel_dims(&width, &height);
+   if (surface->num_samples > 0) { /* TODO: wrong for 8x */
+      width /= 2;
+      height /= 2;
+   }
    if (surface->map_stencil_as_y_tiled) {
       width *= 2;
       height /= 2;
@@ -181,6 +185,8 @@ gen7_blorp_emit_surface_state(struct brw_context *brw,
       pitch_bytes *= 2;
    surf->ss3.pitch = pitch_bytes - 1;
 
+   gen7_set_surface_num_multisamples(surf, surface->num_samples);
+
    if (intel->is_haswell) {
       surf->ss7.shader_chanel_select_r = HSW_SCS_RED;
       surf->ss7.shader_chanel_select_g = HSW_SCS_GREEN;
@@ -366,7 +372,7 @@ gen7_blorp_emit_sf_config(struct brw_context *brw,
       OUT_BATCH(_3DSTATE_SF << 16 | (7 - 2));
       OUT_BATCH(params->depth_format <<
                 GEN7_SF_DEPTH_BUFFER_SURFACE_FORMAT_SHIFT);
-      OUT_BATCH(0);
+      OUT_BATCH(params->num_samples > 0 ? GEN6_SF_MSRAST_ON_PATTERN : 0);
       OUT_BATCH(0);
       OUT_BATCH(0);
       OUT_BATCH(0);
@@ -397,7 +403,7 @@ gen7_blorp_emit_wm_config(struct brw_context *brw,
 {
    struct intel_context *intel = &brw->intel;
 
-   uint32_t dw1 = 0;
+   uint32_t dw1 = 0, dw2 = 0;
 
    switch (params->hiz_op) {
    case GEN6_HIZ_OP_DEPTH_CLEAR:
@@ -423,10 +429,18 @@ gen7_blorp_emit_wm_config(struct brw_context *brw,
       dw1 |= GEN7_WM_DISPATCH_ENABLE; /* We are rendering */
    }
 
+      if (params->num_samples > 0) {
+         dw1 |= GEN7_WM_MSRAST_ON_PATTERN;
+         dw2 |= GEN7_WM_MSDISPMODE_PERPIXEL;
+      } else {
+         dw1 |= GEN7_WM_MSRAST_OFF_PIXEL;
+         dw2 |= GEN7_WM_MSDISPMODE_PERSAMPLE;
+      }
+
    BEGIN_BATCH(3);
    OUT_BATCH(_3DSTATE_WM << 16 | (3 - 2));
    OUT_BATCH(dw1);
-   OUT_BATCH(0);
+   OUT_BATCH(dw2);
    ADVANCE_BATCH();
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c
index 5c6fced..8a6c09b 100644
--- a/src/mesa/drivers/dri/i965/gen7_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c
@@ -161,6 +161,9 @@ upload_sf_state(struct brw_context *brw)
    float point_size;
    /* _NEW_BUFFERS */
    bool render_to_fbo = _mesa_is_user_fbo(brw->intel.ctx.DrawBuffer);
+   bool multisampled = false;
+   if (ctx->DrawBuffer->_ColorDrawBuffers[0])
+      multisampled = ctx->DrawBuffer->_ColorDrawBuffers[0]->NumSamples > 0;
 
    dw1 = GEN6_SF_STATISTICS_ENABLE |
          GEN6_SF_VIEWPORT_TRANSFORM_ENABLE;
@@ -243,8 +246,13 @@ upload_sf_state(struct brw_context *brw)
       dw2 |= GEN6_SF_SCISSOR_ENABLE;
 
    /* _NEW_LINE */
-   dw2 |= U_FIXED(CLAMP(ctx->Line.Width, 0.0, 7.99), 7) <<
-      GEN6_SF_LINE_WIDTH_SHIFT;
+   {
+      uint32_t line_width_u3_7 = U_FIXED(CLAMP(ctx->Line.Width, 0.0, 7.99), 7);
+      /* TODO: line width of 0 is not allowed when MSAA enabled */
+      if (line_width_u3_7 == 0)
+         line_width_u3_7 = 1;
+      dw2 |= line_width_u3_7 << GEN6_SF_LINE_WIDTH_SHIFT;
+   }
    if (ctx->Line.SmoothFlag) {
       dw2 |= GEN6_SF_LINE_AA_ENABLE;
       dw2 |= GEN6_SF_LINE_AA_MODE_TRUE;
@@ -253,6 +261,8 @@ upload_sf_state(struct brw_context *brw)
    if (ctx->Line.StippleFlag && intel->is_haswell) {
       dw2 |= HSW_SF_LINE_STIPPLE_ENABLE;
    }
+   if (multisampled)
+      dw2 |= GEN6_SF_MSRAST_ON_PATTERN;
 
    /* FINISHME: Last Pixel Enable?  Vertex Sub Pixel Precision Select?
     * FINISHME: AA Line Distance Mode?
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index 024c855..2a0462f 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -39,9 +39,14 @@ upload_wm_state(struct brw_context *brw)
    const struct brw_fragment_program *fp =
       brw_fragment_program_const(brw->fragment_program);
    bool writes_depth = false;
-   uint32_t dw1;
+   bool multisampled = false;
+   uint32_t dw1, dw2;
 
-   dw1 = 0;
+   /* _NEW_BUFFERS */
+   if (ctx->DrawBuffer->_ColorDrawBuffers[0])
+      multisampled = ctx->DrawBuffer->_ColorDrawBuffers[0]->NumSamples > 0;
+
+   dw1 = dw2 = 0;
    dw1 |= GEN7_WM_STATISTICS_ENABLE;
    dw1 |= GEN7_WM_LINE_AA_WIDTH_1_0;
    dw1 |= GEN7_WM_LINE_END_CAP_AA_WIDTH_0_5;
@@ -74,11 +79,18 @@ upload_wm_state(struct brw_context *brw)
        dw1 & GEN7_WM_KILL_ENABLE) {
       dw1 |= GEN7_WM_DISPATCH_ENABLE;
    }
+   if (multisampled) {
+      dw1 |= GEN7_WM_MSRAST_ON_PATTERN;
+      dw2 |= GEN7_WM_MSDISPMODE_PERPIXEL;
+   } else {
+      dw1 |= GEN7_WM_MSRAST_OFF_PIXEL;
+      dw2 |= GEN7_WM_MSDISPMODE_PERSAMPLE;
+   }
 
    BEGIN_BATCH(3);
    OUT_BATCH(_3DSTATE_WM << 16 | (3 - 2));
    OUT_BATCH(dw1);
-   OUT_BATCH(0);
+   OUT_BATCH(dw2);
    ADVANCE_BATCH();
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
index d84e075..5aa62bd 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
@@ -54,6 +54,20 @@ gen7_set_surface_tiling(struct gen7_surface_state *surf, uint32_t tiling)
    }
 }
 
+
+void
+gen7_set_surface_num_multisamples(struct gen7_surface_state *surf,
+                                  unsigned num_samples)
+{
+   /* 0 => single-sampled, 1..4 => 4x, anything larger => 8x. */
+   unsigned count = GEN7_SURFACE_MULTISAMPLECOUNT_1;
+   if (num_samples > 4)
+      count = GEN7_SURFACE_MULTISAMPLECOUNT_8;
+   else if (num_samples > 0)
+      count = GEN7_SURFACE_MULTISAMPLECOUNT_4;
+   surf->ss4.num_multisamples = count;
+}
+
+
 static void
 gen7_update_buffer_texture_surface(struct gl_context *ctx, GLuint unit)
 {
@@ -328,6 +342,8 @@ gen7_update_renderbuffer_surface(struct brw_context *brw,
    gen7_set_surface_tiling(surf, region->tiling);
    surf->ss3.pitch = (region->pitch * region->cpp) - 1;
 
+   gen7_set_surface_num_multisamples(surf, irb->mt->num_samples);
+
    if (intel->is_haswell) {
       surf->ss7.shader_chanel_select_r = HSW_SCS_RED;
       surf->ss7.shader_chanel_select_g = HSW_SCS_GREEN;
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index f92d78f..bbd5f66 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -189,6 +189,29 @@ intel_unmap_renderbuffer(struct gl_context *ctx,
 
 
 /**
+ * Round up the requested multisample count to the next supported sample size.
+ */
+static unsigned
+quantize_num_samples(struct intel_context *intel, unsigned num_samples)
+{
+   /* Gen6 supports only 4x multisampling, so every nonzero request is
+    * mapped to 4.
+    */
+   const bool gen6_msaa = intel->gen == 6 && num_samples > 0;
+
+   /* Everything else is single-sampled:
+    *
+    *  - Gen7.  TODO: MSAA only implemented on Gen6.
+    *  - Any other generation, where MSAA is unsupported.
+    *  - A request for zero samples.
+    */
+   if (!gen6_msaa)
+      return 0;
+   return 4;
+}
+
+
+/**
  * Called via glRenderbufferStorageEXT() to set the format and allocate
  * storage for a user-created renderbuffer.
  */
@@ -199,6 +222,7 @@ intel_alloc_renderbuffer_storage(struct gl_context * ctx, struct gl_renderbuffer
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+   rb->NumSamples = quantize_num_samples(intel, rb->NumSamples);
 
    ASSERT(rb->Name != 0);
 
@@ -241,12 +265,13 @@ intel_alloc_renderbuffer_storage(struct gl_context * ctx, struct gl_renderbuffer
       return true;
 
    irb->mt = intel_miptree_create_for_renderbuffer(intel, rb->Format,
-						   width, height);
+						   width, height,
+                                                   rb->NumSamples);
    if (!irb->mt)
       return false;
 
    if (intel->vtbl.is_hiz_depth_format(intel, rb->Format)) {
-      bool ok = intel_miptree_alloc_hiz(intel, irb->mt);
+      bool ok = intel_miptree_alloc_hiz(intel, irb->mt, rb->NumSamples);
       if (!ok) {
 	 intel_miptree_release(&irb->mt);
 	 return false;
@@ -495,7 +520,7 @@ intel_renderbuffer_update_wrapper(struct intel_context *intel,
 
    if (mt->hiz_mt == NULL &&
        intel->vtbl.is_hiz_depth_format(intel, rb->Format)) {
-      intel_miptree_alloc_hiz(intel, mt);
+      intel_miptree_alloc_hiz(intel, mt, 0 /* num_samples */);
       if (!mt->hiz_mt)
 	 return false;
    }
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
index 91ebc8d..99f4230 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
@@ -72,7 +72,8 @@ intel_miptree_create_internal(struct intel_context *intel,
 			      GLuint width0,
 			      GLuint height0,
 			      GLuint depth0,
-			      bool for_region)
+			      bool for_region,
+                              GLuint num_samples)
 {
    struct intel_mipmap_tree *mt = calloc(sizeof(*mt), 1);
    int compress_byte = 0;
@@ -92,6 +93,7 @@ intel_miptree_create_internal(struct intel_context *intel,
    mt->width0 = width0;
    mt->height0 = height0;
    mt->cpp = compress_byte ? compress_byte : _mesa_get_format_bytes(mt->format);
+   mt->num_samples = num_samples;
    mt->compressed = compress_byte ? 1 : 0;
    mt->refcount = 1; 
 
@@ -115,7 +117,8 @@ intel_miptree_create_internal(struct intel_context *intel,
                                             mt->width0,
                                             mt->height0,
                                             mt->depth0,
-                                            true);
+                                            true,
+                                            num_samples);
       if (!mt->stencil_mt) {
 	 intel_miptree_release(&mt);
 	 return NULL;
@@ -161,7 +164,8 @@ intel_miptree_create(struct intel_context *intel,
 		     GLuint width0,
 		     GLuint height0,
 		     GLuint depth0,
-		     bool expect_accelerated_upload)
+		     bool expect_accelerated_upload,
+                     GLuint num_samples)
 {
    struct intel_mipmap_tree *mt;
    uint32_t tiling = I915_TILING_NONE;
@@ -172,7 +176,21 @@ intel_miptree_create(struct intel_context *intel,
 	  (base_format == GL_DEPTH_COMPONENT ||
 	   base_format == GL_DEPTH_STENCIL_EXT))
 	 tiling = I915_TILING_Y;
-      else if (width0 >= 64)
+      else if (num_samples > 0) {
+         /* From p82 of the Sandy Bridge PRM, dw3[1] of SURFACE_STATE ("Tiled
+          * Surface"):
+          *
+          *   [DevSNB+]: For multi-sample render targets, this field must be
+          *   1. MSRTs can only be tiled.
+          *
+          * Our usual reason for preferring X tiling (fast blits using the
+          * blitting engine) doesn't apply to MSAA, since we'll generally be
+          * downsampling or upsampling when blitting between the MSAA buffer
+          * and another buffer, and the blitting engine doesn't support that.
+          * So use Y tiling, since it makes better use of the cache.
+          */
+         tiling = I915_TILING_Y;
+      } else if (width0 >= 64)
 	 tiling = I915_TILING_X;
    }
 
@@ -189,7 +207,7 @@ intel_miptree_create(struct intel_context *intel,
    mt = intel_miptree_create_internal(intel, target, format,
 				      first_level, last_level, width0,
 				      height0, depth0,
-				      false);
+				      false, num_samples);
    /*
     * pitch == 0 || height == 0  indicates the null texture
     */
@@ -225,7 +243,7 @@ intel_miptree_create_for_region(struct intel_context *intel,
    mt = intel_miptree_create_internal(intel, target, format,
 				      0, 0,
 				      region->width, region->height, 1,
-				      true);
+				      true, 0 /* num_samples */);
    if (!mt)
       return mt;
 
@@ -238,12 +256,24 @@ struct intel_mipmap_tree*
 intel_miptree_create_for_renderbuffer(struct intel_context *intel,
                                       gl_format format,
                                       uint32_t width,
-                                      uint32_t height)
+                                      uint32_t height,
+                                      uint32_t num_samples)
 {
    struct intel_mipmap_tree *mt;
 
+   /* Adjust width/height for MSAA */
+   if (num_samples > 4) {
+      num_samples = 8;
+      width *= 4;
+      height *= 2;
+   } else if (num_samples > 0) {
+      num_samples = 4;
+      width *= 2;
+      height *= 2;
+   }
+
    mt = intel_miptree_create(intel, GL_TEXTURE_2D, format, 0, 0,
-			     width, height, 1, true);
+			     width, height, 1, true, num_samples);
 
    return mt;
 }
@@ -513,7 +543,8 @@ intel_miptree_copy_teximage(struct intel_context *intel,
 
 bool
 intel_miptree_alloc_hiz(struct intel_context *intel,
-			struct intel_mipmap_tree *mt)
+			struct intel_mipmap_tree *mt,
+                        GLuint num_samples)
 {
    assert(mt->hiz_mt == NULL);
    mt->hiz_mt = intel_miptree_create(intel,
@@ -524,7 +555,8 @@ intel_miptree_alloc_hiz(struct intel_context *intel,
                                      mt->width0,
                                      mt->height0,
                                      mt->depth0,
-                                     true);
+                                     true,
+                                     num_samples);
 
    if (!mt->hiz_mt)
       return false;
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.h b/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
index 0886c95..ca1666d 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
@@ -169,6 +169,7 @@ struct intel_mipmap_tree
 
    GLuint width0, height0, depth0; /**< Level zero image dimensions */
    GLuint cpp;
+   GLuint num_samples;
    bool compressed;
 
    /* Derived from the above:
@@ -231,7 +232,8 @@ struct intel_mipmap_tree *intel_miptree_create(struct intel_context *intel,
                                                GLuint width0,
                                                GLuint height0,
                                                GLuint depth0,
-					       bool expect_accelerated_upload);
+					       bool expect_accelerated_upload,
+                                               GLuint num_samples);
 
 struct intel_mipmap_tree *
 intel_miptree_create_for_region(struct intel_context *intel,
@@ -250,7 +252,8 @@ struct intel_mipmap_tree*
 intel_miptree_create_for_renderbuffer(struct intel_context *intel,
                                       gl_format format,
                                       uint32_t width,
-                                      uint32_t height);
+                                      uint32_t height,
+                                      uint32_t num_samples);
 
 /** \brief Assert that the level and layer are valid for the miptree. */
 static inline void
@@ -341,7 +344,8 @@ intel_miptree_s8z24_gather(struct intel_context *intel,
 
 bool
 intel_miptree_alloc_hiz(struct intel_context *intel,
-			struct intel_mipmap_tree *mt);
+			struct intel_mipmap_tree *mt,
+                        GLuint num_samples);
 
 void
 intel_miptree_slice_set_needs_hiz_resolve(struct intel_mipmap_tree *mt,
diff --git a/src/mesa/drivers/dri/intel/intel_tex_image.c b/src/mesa/drivers/dri/intel/intel_tex_image.c
index 651095a..68f4ff4 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_image.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_image.c
@@ -99,7 +99,8 @@ intel_miptree_create_for_teximage(struct intel_context *intel,
 			       width,
 			       height,
 			       depth,
-			       expect_accelerated_upload);
+			       expect_accelerated_upload,
+                               0 /* num_samples */);
 }
 
 /* There are actually quite a few combinations this will work for,
diff --git a/src/mesa/drivers/dri/intel/intel_tex_validate.c b/src/mesa/drivers/dri/intel/intel_tex_validate.c
index 256c21e..cadba29 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_validate.c
@@ -86,7 +86,8 @@ intel_finalize_mipmap_tree(struct intel_context *intel, GLuint unit)
                                           width,
                                           height,
                                           depth,
-					  true);
+					  true,
+                                          0 /* num_samples */);
       if (!intelObj->mt)
          return false;
    }