#ifdef TMAP_PER_ALPHA

#include <stdlib.h>
#include <string.h>

#include "fix.h"
#include "gr.h"
#include "texmap.h"
#include "texmapl.h"


/* Convert IN to a long.

   The intent is to use the rounding form of the Alpha conversion
   instruction rather than the cropping (truncating) form that C emits
   for float->long conversions.  The inline-asm variant is disabled
   because the compiler cannot schedule asm blocks; the plain C
   conversion is emitted instead and the generated assembly file is
   fixed up afterwards.  */
static inline long cvttq(float in)
{
#if 0
  long result;
  asm("cvttq %1,%0" : "=f"(result) : "f"(in));
  return result;
#else
  long const cropped = (long)in;
  return cropped;
#endif
}

/* Store DATA, converted to unsigned int, at POS.  POS need not be
   aligned for an unsigned int.

   The bytes are copied with memcpy rather than written through a cast
   pointer, so the store is valid whatever the effective type of the
   destination buffer is (no strict-aliasing violation) and whatever its
   alignment.  Compilers expand a fixed-size memcpy like this inline.  */
static inline void ua_store(void *pos, unsigned long data)
{
  unsigned int const word = (unsigned int)data;

  memcpy(pos, &word, sizeof word);
}

/* Return the unsigned int stored at POS, widened to unsigned long.  POS
   need not be aligned for an unsigned int.

   The bytes are copied out with memcpy rather than read through a cast
   pointer, so the load is valid whatever the effective type of the
   source buffer is (no strict-aliasing violation) and whatever its
   alignment.  POS is only read, hence the const qualifier.  */
static inline unsigned long ua_load(const void *pos)
{
  unsigned int word;

  memcpy(&word, pos, sizeof word);
  return word;
}

/* One perspective step.  Uses the enclosing scope's f_u, f_v, f_z,
   f_invz64, f_dudx4, f_dvdx4, f_dzdx4, ubyz4 and vbyz4.

   The current u and v are scaled by f_invz64 and converted with cvttq()
   into ubyz4 and vbyz4; u, v and z are then advanced by their
   four-pixel deltas.  f_invz64 itself is not refreshed here; callers
   that need it for a following step recompute it from the new f_z.  */
#define DO_FP_CALCS				\
do						\
  {						\
    float scaled_u = f_invz64 * f_u;		\
    float scaled_v = f_invz64 * f_v;		\
    f_u += f_dudx4;				\
    f_v += f_dvdx4;				\
    f_z += f_dzdx4;				\
    ubyz4 = cvttq(scaled_u);			\
    vbyz4 = cvttq(scaled_v);			\
  }						\
while (0)

/* Pack the enclosing scope's ubyz4 and vbyz4 into a single word.  Each
   value has one added and is masked to bits 4..13; the u field stays in
   place and the v field is moved up by 38 bits (to bits 42..51).  */
#define ENCODE_UVZERO \
  ((((vbyz4 + 1) & 0x3ff0) << 38) | ((ubyz4 + 1) & 0x3ff0))

/* Turn a packed u/v word X into a 12-bit value: bits 6..11 of X become
   bits 0..5 of the result and bits 44..49 of X become bits 6..11.  All
   other bits of X are discarded.  */
#define NORMALIZE_UVZERO(X) \
  ((((X) >> 6) & 0x3f) | (((X) >> 38) & 0xfc0))

#ifdef HISTOGRAM
/* Scanline statistics, compiled in only when HISTOGRAM is defined.
   c_tmap_scanline_per() updates the counters on every call and
   lemme_know() prints them.  */
static struct {
  /* Number of calls recorded.  */
  long total;
  /* Span-length buckets: hist[0..31] count lengths 1..32, hist[32]
     counts lengths 33..64 and hist[33] counts anything longer.  */
  long hist[34];
  /* Calls bucketed by the low two bits of the first pixel's address.  */
  long ain[4];
  /* Calls bucketed by the low two bits of the address sum the scanline
     code forms from the row start, fx_xleft and fx_xright + 1.  */
  long aout[4];
} know;

/* Print the statistics gathered in `know' to stdout.  Registered with
   the GCC destructor attribute, so it runs automatically when the
   program exits.

   The total is printed as a count; every other counter is printed as a
   fraction of that total.  When nothing was recorded only the total
   line is printed, because every fraction would be 0/0.  */
static void __attribute__((destructor))
lemme_know(void)
{
  int i;
  double t = know.total;

  printf("total\t\t%9ld\n", know.total);

  if (know.total == 0)
    return;

  for (i = 1; i <= 32; ++i)
    printf("size %3d\t%.3f\n", i, know.hist[i-1]/t);

  /* hist[32] is filled for lengths 33..64 and hist[33] for anything
     longer, so label the two overflow rows accordingly.  */
  printf("sz 33-64\t%.3f\n", know.hist[32]/t);
  printf("sz >= 65\t%.3f\n", know.hist[33]/t);

  for (i = 0; i < 4; ++i)
    printf("ain %d\t%.3f\n", i, know.ain[i]/t);
  for (i = 0; i < 4; ++i)
    printf("aout %d\t%.3f\n", i, know.aout[i]/t);
}
#endif

/* Draw one perspective-corrected, light-faded texture scanline.

   All inputs are file-external state: the starting values fx_u, fx_v,
   fx_z and fx_l with their per-pixel steps fx_du_dx, fx_dv_dx, fx_dz_dx
   and fx_dl_dx; the span fx_xleft..fx_xright (inclusive) on row fx_y;
   the texel source pixptr; the lighting table gr_fade_table; and the
   destination write_buffer with pitch bytes_per_row.  Nothing is
   returned; the only effect is the bytes stored into write_buffer
   (plus the `know' counters when HISTOGRAM is defined).

   64*u/z and 64*v/z are evaluated only at every fourth pixel.  The
   pixels in between are linearly interpolated on the packed u/v word
   built by ENCODE_UVZERO, whose v field sits at bits 42..51, so
   unsigned long must be wide enough to hold it.

   When Transparency_on is set, texels equal to TRANSPARENCY_COLOR leave
   the destination byte unchanged.  The one-to-three pixel tail applies
   that test regardless of Transparency_on.  */
void c_tmap_scanline_per(void)
{
  /* Multiply dudx, dvdx, and dzdx by four, converted into floating point.  */
  float const f_dudx = (float)fx_du_dx;
  float const f_dvdx = (float)fx_dv_dx;
  float const f_dzdx = (float)fx_dz_dx;
  float const f_dudx4 = f_dudx * 4.0f;
  float const f_dvdx4 = f_dvdx * 4.0f;
  float const f_dzdx4 = f_dzdx * 4.0f;

  /* Copy these global array pointers into registers.  */
  unsigned char * const r_pixptr = pixptr;
  unsigned char * const r_fadetbl = gr_fade_table;

  float f_u, f_v, f_z, f_invz64;
  unsigned char *dest, *lastpixel;
  unsigned long ubyz4, vbyz4, uvzero;
  unsigned long l, dldx, left;

  /* Load z, u and v as floats, and then start calculating the
     first 64 / z.  */

  f_z = fx_z;
  f_u = fx_u;
  f_v = fx_v;
  f_invz64 = 64.0f / f_z;

  /* Find the current scanline and assorted pointers.  */

  dest = (unsigned char *)write_buffer + bytes_per_row * fx_y;

#ifdef HISTOGRAM
  {
    /* Signed, so that an empty or inverted span is recognised instead
       of indexing hist[] out of bounds or being counted as "> 64".  */
    long span = (long)fx_xright - (long)fx_xleft + 1;

    know.total += 1;
    if (span > 64)
      know.hist[33] += 1;
    else if (span > 32)
      know.hist[32] += 1;
    else if (span >= 1)
      know.hist[span - 1] += 1;
    know.ain[((unsigned long)dest + fx_xleft) & 3] += 1;
    /* NOTE(review): the end pointer used below is dest + fx_xright + 1;
       the extra fx_xleft term here looks unintended -- confirm.  */
    know.aout[((unsigned long)dest + fx_xleft + fx_xright + 1) & 3] += 1;
  }
#endif

  /* Meanwhile, get l and dldx (again, the latter multiplied by four).
     The original values are divided by 256 so that the byte needed for
     the fade table offset will be aligned.  */

  l = fx_l;
  dldx = fx_dl_dx;
  l >>= 16 - 8;
  dldx >>= 16 - 8 - 2;

  /* Back to the scanline bits... */

  lastpixel = dest + fx_xright + 1;
  dest += fx_xleft;

  /* Nothing to draw for an empty or inverted span.  */
  if (dest >= lastpixel)
    return;

  /* Calculate 64 * u / z and 64 * v / z, store, and increment u, v,
     and z.  Then start calculating the second 64 / z.  */

  DO_FP_CALCS;
  f_invz64 = 64.0f / f_z;

  /* Get our u/z and v/z values, and lop off the bits we don't care about.  */

  uvzero = ENCODE_UVZERO;

  /* Are there at least four pixels to draw?  If not, skip to the
     epilog code.  */

  lastpixel -= 4;
  if (dest <= lastpixel)
    {
      /* Do we need to test for transparencies?  */
      if (!Transparency_on)
	{
	  do
	    {
	      unsigned char *offset_fadetbl;
	      unsigned long uv0, uv1, uv2, uv3;
	      unsigned long nuv0, nuv1, nuv2, nuv3;
	      unsigned long pp0, pp1, pp2, pp3;
	      unsigned long res0, res1, res2, res3;
	      unsigned long duvz;

	      /* While the FPU is busy dividing, the latest u/z and v/z
		 values are retrieved, packed, and stored in uvzero (to
		 be used again in the next iteration).  The old uvzero
		 value, which holds the values for pixel 0, is
		 subtracted from the new uvzero value to determine the
		 total change in u/z and v/z across the four pixels.

		 This is divided by 4 to get the average, which is used to
		 estimate the values for pixels 1, 2, and 3.  */

	      DO_FP_CALCS;
	      f_invz64 = 64.0f / f_z;

	      uv0 = uvzero;
	      nuv0 = NORMALIZE_UVZERO(uv0);
	      pp0 = r_pixptr[nuv0];

	      offset_fadetbl = r_fadetbl + (l & 0xff00);
	      l += dldx;

	      res0 = offset_fadetbl[pp0];

	      /* Bit 30 is set before subtracting so that a borrow out of
		 the low (u) field stops there and never reaches the
		 high (v) field.  */
	      uvzero = ENCODE_UVZERO;
	      duvz = ((uvzero | 0x40000000) - uv0) >> 2;

	      uv1 = uv0 + duvz;
	      uv2 = uv1 + duvz;
	      uv3 = uv2 + duvz;

	      nuv1 = NORMALIZE_UVZERO(uv1);
	      nuv2 = NORMALIZE_UVZERO(uv2);
	      nuv3 = NORMALIZE_UVZERO(uv3);

	      pp1 = r_pixptr[nuv1];
	      pp2 = r_pixptr[nuv2];
	      pp3 = r_pixptr[nuv3];

	      res1 = (unsigned long)offset_fadetbl[pp1] << 8;
	      res2 = (unsigned long)offset_fadetbl[pp2] << 16;
	      res3 = (unsigned long)offset_fadetbl[pp3] << 24;

	      /* Pack and store RES[0-3].  */
	      ua_store(dest, res0 | res1 | res2 | res3);
	      dest += 4;
	    }
	  while (dest <= lastpixel);
	}
      else
	{
	  do
	    {
	      unsigned char *offset_fadetbl;
	      unsigned long uv0, uv1, uv2, uv3;
	      unsigned long nuv0, nuv1, nuv2, nuv3;
	      unsigned long pp0, pp1, pp2, pp3;
	      unsigned long res0, res1, res2, res3;
	      unsigned long duvz;
	      unsigned long orig, o0, o1, o2, o3;

	      /* Same scheme as the opaque loop above: the next exact
		 u/z and v/z pair is computed while this group is drawn,
		 and pixels 1, 2, and 3 are estimated by stepping a
		 quarter of the difference.  In addition the four
		 destination bytes are read first, so that a transparent
		 texel can put the existing byte back.  */

	      DO_FP_CALCS;
	      f_invz64 = 64.0f / f_z;

	      orig = ua_load(dest);

	      uv0 = uvzero;
	      nuv0 = NORMALIZE_UVZERO(uv0);
	      pp0 = r_pixptr[nuv0];

	      offset_fadetbl = r_fadetbl + (l & 0xff00);
	      l += dldx;
	      o0 = orig & 0xff;

	      res0 = offset_fadetbl[pp0];
	      if (pp0 == TRANSPARENCY_COLOR)
		res0 = o0;

	      /* Bit 30 guards the high (v) field against a borrow out of
		 the low (u) field, as in the opaque loop.  */
	      uvzero = ENCODE_UVZERO;
	      duvz = ((uvzero | 0x40000000) - uv0) >> 2;

	      uv1 = uv0 + duvz;
	      uv2 = uv1 + duvz;
	      uv3 = uv2 + duvz;

	      nuv1 = NORMALIZE_UVZERO(uv1);
	      nuv2 = NORMALIZE_UVZERO(uv2);
	      nuv3 = NORMALIZE_UVZERO(uv3);

	      pp1 = r_pixptr[nuv1];
	      pp2 = r_pixptr[nuv2];
	      pp3 = r_pixptr[nuv3];

	      o1 = (orig >> 8) & 0xff;
	      o2 = (orig >> 16) & 0xff;
	      o3 = (orig >> 24) & 0xff;

	      res1 = offset_fadetbl[pp1];
	      res2 = offset_fadetbl[pp2];
	      res3 = offset_fadetbl[pp3];

	      if (pp1 == TRANSPARENCY_COLOR)
		res1 = o1;
	      if (pp2 == TRANSPARENCY_COLOR)
		res2 = o2;
	      if (pp3 == TRANSPARENCY_COLOR)
		res3 = o3;

	      res1 <<= 8;
	      res2 <<= 16;
	      res3 <<= 24;

	      /* Pack and store RES[0-3].  */
	      ua_store(dest, res0 | res1 | res2 | res3);
	      dest += 4;
	    }
	  while (dest <= lastpixel);
	}
    }

  /* Are there any pixels left at all? */

  lastpixel += 4;
  left = lastpixel - dest;
  if (left)
    {
      unsigned char *offset_fadetbl;
      unsigned long uv0, duvz;

      /* The loops above leave fewer than four pixels.  Anything else
	 means the pointers are corrupt, so stop hard.  abort() is used
	 because a store through a null pointer is undefined behaviour
	 that the compiler is allowed to remove.  */
      if (left > 3)
	abort();

      /* Here we finish off the last one-to-three pixels assigned to us.
	 Rather than calculating values for all four pixels, we just divide
	 the difference by four and keep adding this average into the value
	 as needed.  (This code is not particularly optimized, by the way,
	 since it represents such a minuscule amount of the running time.)
	 Transparent texels are skipped here whether or not
	 Transparency_on is set.  */

      DO_FP_CALCS;

      offset_fadetbl = r_fadetbl + (l & 0xff00);
      uv0 = uvzero;
      uvzero = ENCODE_UVZERO;
      duvz = ((uvzero | 0x40000000) - uv0) >> 2;

      if (left & 2)
	{
	  unsigned long uv1, nuv0, nuv1, pp0, pp1;

	  uv1 = uv0 + duvz;
	  nuv0 = NORMALIZE_UVZERO(uv0);
	  nuv1 = NORMALIZE_UVZERO(uv1);
	  /* Leave uv0 positioned on the pixel after this pair.  */
	  uv0 = uv1 + duvz;

	  pp0 = r_pixptr[nuv0];
	  pp1 = r_pixptr[nuv1];

	  if (pp0 != TRANSPARENCY_COLOR)
	    dest[0] = offset_fadetbl[pp0];
	  if (pp1 != TRANSPARENCY_COLOR)
	    dest[1] = offset_fadetbl[pp1];
	  dest += 2;
	}
      if (left & 1)
	{
	  unsigned long nuv0, pp0;

	  nuv0 = NORMALIZE_UVZERO(uv0);
	  pp0 = r_pixptr[nuv0];
	  if (pp0 != TRANSPARENCY_COLOR)
	    *dest = offset_fadetbl[pp0];
	}
    }
}

#endif /* TMAP_PER_ALPHA */
