/*
 * tiler-nv12.c
 *
 * TILER driver NV12 area reservation functions for TI TILER hardware block.
 *
 * Author: Lajos Molnar <molnar@ti.com>
 *
 * Copyright (C) 2009-2010 Texas Instruments, Inc.
 *
 * This package is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * THIS PACKAGE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 */

#include "_tiler.h"

static struct tiler_ops *ops;	/* shared methods and variables */
static int band_8;
static int band_16;

/*
 * NV12 Reservation Functions
 *
 * TILER is designed so that a (w * h) * 8bit area is twice as wide as a
 * (w/2 * h/2) * 16bit area.  Since having pairs of such 8-bit and 16-bit
 * blocks is a common usecase for TILER, we optimize packing these into a
 * TILER area.
 *
 * During reservation we want to find the most effective packing (most used area
 * in the smallest overall area)
 *
 * We have two algorithms for packing nv12 blocks: either pack 8- and 16-bit
 * blocks into separate container areas, or pack them together into same area.
 */

/**
 * Calculate effectiveness of packing. We weight total area much higher than
 * packing efficiency to get the smallest overall container use.
 *
 * @param w		width of one (8-bit) block
 * @param n		buffers in a packing
 * @param area		width of packing area
 * @param n_total	total number of buffers to be packed
 * @return effectiveness, the higher the better
 */
static inline u32 nv12_eff(u16 w, u16 n, u16 area, u16 n_total)
{
	return 0x10000000 -
		/* weigh against total area needed (for all buffers) */
		/* 64-slots = -2048 */
		DIV_ROUND_UP(n_total, n) * area * 32 +
		/* packing efficiency (0 - 1024) */
		1024 * n * ((w * 3 + 1) >> 1) / area;
}

/**
 * Fallback nv12 packing algorithm: pack 8 and 16 bit block into separate
 * areas.
 *
 * @author a0194118 (7/16/2010)
 *
 * @param o	desired offset (<a)
 * @param a	desired alignment (>=2)
 * @param w	block width (>0)
 * @param n	number of blocks desired
 * @param area	pointer to store total area needed
 *
 * @return number of blocks that can be allocated
 */
static u16 nv12_separate(u16 o, u16 a, u16 w, u16 n, u16 *area)
{
	tiler_best2pack(o, a, band_8, w, &n, area);
	tiler_best2pack(o >> 1, a >> 1, band_16, (w + 1) >> 1, &n, area);
	*area *= 3;
	return n;
}

/*
 * Specialized NV12 Reservation Algorithms
 *
 * We use 4 packing methods that pack nv12 blocks into the same area.  Together
 * these 4 methods give the optimal result for most possible input parameters.
 *
 * For now we pack into a 64-slot area, so that we don't have to worry about
 * stride issues (all blocks get 4K stride). For some of the algorithms this
 * could be true even if the area was 128.
 */

/**
 * Packing types are marked using a letter sequence, capital letters denoting
 * 8-bit blocks, lower case letters denoting corresponding 16-bit blocks.
 *
 * All methods have the following parameters. They also define the maximum
 * number of coordinates that could potentially be packed.
 *
 * @param o, a, w, n offset, alignment, width, # of blocks as usual
 * @param area		pointer to store area needed for packing
 * @param p		pointer to store packing coordinates
 * @return		number of blocks that can be packed
 */

/* Method A: progressive packing: AAAAaaaaBBbbCc into 64-slot area */
#define MAX_A 21
static int nv12_A(u16 o, u16 a, u16 w, u16 n, u16 *area, u8 *p)
{
	u16 x = o, u, l, m = 0;
	*area = band_8;

	while (x + w < *area && m < n) {
		/* current 8bit upper bound (a) is next 8bit lower bound (B) */
		l = u = (*area + x) >> 1;

		/* pack until upper bound */
		while (x + w <= u && m < n) {
			/* save packing */
			BUG_ON(m + 1 >= MAX_A);
			*p++ = x;
			*p++ = l;
			l = (*area + x + w + 1) >> 1;
			x = ALIGN(x + w - o, a) + o;
			m++;
		}
		x = ALIGN(l - o, a) + o;	/* set new lower bound */
	}
	return m;
}

/* Method -A: regressive packing: cCbbBBaaaaAAAA into 64-slot area */
static int nv12_revA(u16 o, u16 a, u16 w, u16 n, u16 *area, u8 *p)
{
	u16 m;

	/* this is a mirrored packing of method A */
	n = nv12_A((a - (o + w) % a) % a, a, w, n, area, p);

	/* reverse packing */
	for (m = 0; m < n; m++) {
		*p = *area - *p - w;
		p++;
		*p = *area - *p - ((w + 1) >> 1);
		p++;
	}
	return n;
}

/* Method B: simple layout: aAbcBdeCfgDhEFGH */
#define MAX_B 8
static int nv12_B(u16 o, u16 a, u16 w, u16 n, u16 *area, u8 *p)
{
	u16 e  = (o + w) % a;	/* end offset */
	u16 o1 = (o >> 1) % a;			/* half offset */
	u16 e1 = ((o + w + 1) >> 1) % a;	/* half end offset */
	u16 o2 = o1 + (a >> 2);			/* 2nd half offset */
	u16 e2 = e1 + (a >> 2);			/* 2nd half end offset */
	u16 m = 0;
	*area = band_8;

	/* ensure 16-bit blocks don't overlap 8-bit blocks */

	/* width cannot wrap around alignment, half block must be before block,
	   2nd half can be before or after */
	if (w < a && o < e && e1 <= o && (e2 <= o || o2 >= e))
		while (o + w <= *area && m < n) {
			BUG_ON(m + 1 >= MAX_B);
			*p++ = o;
			*p++ = o >> 1;
			m++;
			o += a;
		}
	return m;
}

/* Method C: butterfly layout: AAbbaaBB */
#define MAX_C 20
static int nv12_C(u16 o, u16 a, u16 w, u16 n, u16 *area, u8 *p)
{
	int m = 0;
	u16 o2, e = ALIGN(w, a), i = 0, j = 0;
	*area = band_8;
	o2 = *area - (a - (o + w) % a) % a;	/* end of last possible block */

	m = (min(o2 - 2 * o, 2 * o2 - o - *area) / 3 - w) / e + 1;
	for (i = j = 0; i < m && j < n; i++, j++) {
		BUG_ON(j + 1 >= MAX_C);
		*p++ = o + i * e;
		*p++ = (o + i * e + *area) >> 1;
		if (++j < n) {
			*p++ = o2 - i * e - w;
			*p++ = (o2 - i * e - w) >> 1;
		}
	}
	return j;
}

/* Method D: for large allocation: aA or Aa */
#define MAX_D 1
static int nv12_D(u16 o, u16 a, u16 w, u16 n, u16 *area, u8 *p)
{
	u16 o1, w1 = (w + 1) >> 1, d;
	*area = ALIGN(o + w, band_8);

	for (d = 0; n > 0 && d + o + w <= *area; d += a) {
		/* try to fit 16-bit before 8-bit */
		o1 = ((o + d) % band_8) >> 1;
		if (o1 + w1 <= o + d) {
			*p++ = o + d;
			*p++ = o1;
			return 1;
		}

		/* try to fit 16-bit after 8-bit */
		o1 += ALIGN(d + o + w - o1, band_16);
		if (o1 + w1 <= *area) {
			*p++ = o;
			*p++ = o1;
			return 1;
		}
	}
	return 0;
}

/**
 * Umbrella nv12 packing method. This selects the best packings from the above
 * methods.  It also contains hardcoded packings for parameter combinations
 * that have more efficient packings. This method provides is guaranteed to
 * provide the optimal packing if 2 <= a <= 64 and w <= 64 and n is large.
 */
#define MAX_ANY 21	/* must be MAX(method-MAX-s, hardcoded n-s) */
static u16 nv12_together(u16 o, u16 a, u16 w, u16 n, u16 *area, u8 *packing)
{
	u16 n_best, a_best, n2, a_, o_, w_;

	/* algo results (packings) */
	u8 pack_A[MAX_A * 2], pack_rA[MAX_A * 2];
	u8 pack_B[MAX_B * 2], pack_C[MAX_C * 2];
	u8 pack_D[MAX_D * 2];

	/*
	 * Hardcoded packings.  They are sorted by increasing area, and then by
	 * decreasing n.  We may not get the best efficiency if less than n
	 * blocks are needed as packings are not necessarily sorted in
	 * increasing order.  However, for those n-s one of the other 4 methods
	 * may return the optimal packing.
	 */
	u8 packings[] = {
		/* n=9, o=2, w=4, a=4, area=64 */
		9, 2, 4, 4, 64,
			/* 8-bit, 16-bit block coordinate pairs */
			2, 33,	6, 35,	10, 37,	14, 39,	18, 41,
			46, 23,	50, 25, 54, 27,	58, 29,
		/* o=0, w=12, a=4, n=3 */
		3, 0, 12, 4, 64,
			0, 32,	12, 38,	48, 24,
		/* end */
		0
	}, *p = packings, *p_best = NULL, *p_end;
	p_end = packings + sizeof(packings) - 1;

	/* see which method gives the best packing */

	/* start with smallest area algorithms A, B & C, stop if we can
	   pack all buffers */
	n_best = nv12_A(o, a, w, n, area, pack_A);
	p_best = pack_A;
	if (n_best < n) {
		n2 = nv12_revA(o, a, w, n, &a_best, pack_rA);
		if (n2 > n_best) {
			n_best = n2;
			p_best = pack_rA;
			*area = a_best;
		}
	}
	if (n_best < n) {
		n2 = nv12_B(o, a, w, n, &a_best, pack_B);
		if (n2 > n_best) {
			n_best = n2;
			p_best = pack_B;
			*area = a_best;
		}
	}
	if (n_best < n) {
		n2 = nv12_C(o, a, w, n, &a_best, pack_C);
		if (n2 > n_best) {
			n_best = n2;
			p_best = pack_C;
			*area = a_best;
		}
	}

	/* traverse any special packings */
	while (*p) {
		n2 = *p++;
		o_ = *p++;
		w_ = *p++;
		a_ = *p++;
		/* stop if we already have a better packing */
		if (n2 < n_best)
			break;

		/* check if this packing is satisfactory */
		if (a_ >= a && o + w + ALIGN(o_ - o, a) <= o_ + w_) {
			*area = *p++;
			n_best = min(n2, n);
			p_best = p;
			break;
		}

		/* skip to next packing */
		p += 1 + n2 * 2;
	}

	/*
	 * If so far unsuccessful, check whether 8 and 16 bit blocks can be
	 * co-packed.  This will actually be done in the end by the normal
	 * allocation, but we need to reserve a big-enough area.
	 */
	if (!n_best) {
		n_best = nv12_D(o, a, w, n, area, pack_D);
		p_best = NULL;
	}

	/* store best packing */
	if (p_best && n_best) {
		BUG_ON(n_best > MAX_ANY);
		memcpy(packing, p_best, n_best * 2 * sizeof(*pack_A));
	}

	return n_best;
}

/* reserve nv12 blocks */
static void reserve_nv12(u32 n, u32 width, u32 height,
					u32 gid, struct security_info *si)
{
	u16 w, h, band, a, o = 0, r = 0;
	struct gid_info *gi;
	int res = 0, res2, i;
	u16 n_t, n_s, area_t, area_s;
	u8 packing[2 * MAX_ANY];
	struct list_head reserved = LIST_HEAD_INIT(reserved);

	/* Check input parameters for correctness, and support */
	if (!width || !height || !n ||
	    n > ops->width * ops->height / 2)
		return;

	/* calculate dimensions, band, and alignment in slots */
	if (ops->analize(TILFMT_8BIT, width, height, &w, &h, &band, &a, &o, &r))
		return;

	/* get group context */
	gi = ops->get_gi(si, gid);
	if (!gi)
		return;

	/* reserve in groups until failed or all is reserved */
	for (i = 0; i < n && res >= 0; i += res) {
		/* check packing separately vs together */
		n_s = nv12_separate(o, a, w, n - i, &area_s);
		if (ops->nv12_packed)
			n_t = nv12_together(o, a, w, n - i, &area_t, packing);
		else
			n_t = 0;

		/* pack based on better efficiency */
		res = -1;
		if (!ops->nv12_packed ||
			nv12_eff(w, n_s, area_s, n - i) >
			nv12_eff(w, n_t, area_t, n - i)) {

			/*
			 * Reserve blocks separately into a temporary list, so
			 * that we can free them if unsuccessful. We need to be
			 * able to reserve both 8- and 16-bit blocks as the
			 * offsets of them must match.
			 */
			res = ops->lay_2d(TILFMT_8BIT, n_s, w, h, band_8, a, o,
						gi, &reserved);
			res2 = ops->lay_2d(TILFMT_16BIT, n_s, (w + 1) >> 1, h,
				band_16, a >> 1, o >> 1, gi, &reserved);

			if (res2 < 0 || res < 0 || res != res2) {
				/* clean up */
				ops->release(&reserved);
				res = -1;
			} else {
				/* add list to reserved */
				ops->add_reserved(&reserved, gi);
			}
		}

		/* if separate packing failed, still try to pack together */
		if (res < 0 && ops->nv12_packed && n_t) {
			/* pack together */
			res = ops->lay_nv12(n_t, area_t, w, h, gi, packing);
		}
	}

	ops->release_gi(gi);
}

/* initialize shared method pointers and global static variables */
void tiler_nv12_init(struct tiler_ops *tiler)
{
	ops = tiler;

	ops->reserve_nv12 = reserve_nv12;

	band_8 = PAGE_SIZE / ops->geom(TILFMT_8BIT)->slot_w
		/ ops->geom(TILFMT_8BIT)->bpp;
	band_16 = PAGE_SIZE / ops->geom(TILFMT_16BIT)->slot_w
		/ ops->geom(TILFMT_16BIT)->bpp;
}