#include "c:\watcom\h\stdio.h"
#include "c:\watcom\h\dos.h"
#include "c:\watcom\h\math.h"
#include "c:\watcom\h\sys\types.h"
#include "c:\watcom\h\sys\stat.h"
#include "c:\watcom\h\fcntl.h"
#include "c:\watcom\h\stdlib.h"
#include "c:\watcom\h\conio.h"
#include "c:\watcom\h\string.h"
#include "c:\watcom\h\ctype.h"
#include "c:\watcom\h\io.h"
#include "c:\watcom\h\direct.h"

typedef			unsigned char		UBYTE;
typedef			signed char			SBYTE;
typedef			unsigned char		BYTE;
typedef			unsigned short int	UWORD;
typedef			signed short int		SWORD;
typedef			unsigned short int	WORD;
typedef			unsigned int		ULONG;
typedef			signed int			SLONG;
typedef			unsigned int		LONG;
typedef			unsigned int		UDWORD;
typedef			signed int			SDWORD;
typedef			unsigned int		DWORD;

UBYTE zigzag[64][2]=
{
	{0, 0},
    {0, 1}, {1, 0},
    {2, 0}, {1, 1}, {0, 2},
    {0, 3}, {1, 2}, {2, 1}, {3, 0},
    {4, 0}, {3, 1}, {2, 2}, {1, 3}, {0, 4},
    {0, 5}, {1, 4}, {2, 3}, {3, 2}, {4, 1}, {5, 0},
    {6, 0}, {5, 1}, {4, 2}, {3, 3}, {2, 4}, {1, 5}, {0, 6},
    {0, 7}, {1, 6}, {2, 5}, {3, 4}, {4, 3}, {5, 2}, {6, 1}, {7, 0},
    {7, 1}, {6, 2}, {5, 3}, {4, 4}, {3, 5}, {2, 6}, {1, 7},
    {2, 7}, {3, 6}, {4, 5}, {5, 4}, {6, 3}, {7, 2},
    {7, 3}, {6, 4}, {5, 5}, {4, 6}, {3, 7},
    {4, 7}, {5, 6}, {6, 5}, {7, 4},
    {7, 5}, {6, 6}, {5, 7},
    {6, 7}, {7, 6},
    {7, 7}
};


typedef signed int INT32;
typedef signed int DCTELEM;
typedef DCTELEM DCTBLOCK[8][8];
#define DCTSIZE 8

#define RIGHT_SHIFT(x,y) ((x)>>(y))

#define LG2_DCT_SCALE 16

#define ONE	((INT32) 1)

#define DCT_SCALE (ONE << LG2_DCT_SCALE)

#define LG2_OVERSCALE 2
#define OVERSCALE  (ONE << LG2_OVERSCALE)
#define OVERSHIFT(x)  ((x) <<= LG2_OVERSCALE)

/* Scale a fractional constant by DCT_SCALE */
#define FIX(x)	((INT32) ((x) * DCT_SCALE + 0.5))

/* Scale a fractional constant by DCT_SCALE/OVERSCALE */
/* Such a constant can be multiplied with an overscaled input */
/* to produce something that's scaled by DCT_SCALE */
#define FIXO(x)  ((INT32) ((x) * DCT_SCALE / OVERSCALE + 0.5))

/* Descale and correctly round a value that's scaled by DCT_SCALE */
#define UNFIX(x)   RIGHT_SHIFT((x) + (ONE << (LG2_DCT_SCALE-1)), LG2_DCT_SCALE)

/* Same with an additional division by 2, ie, correctly rounded UNFIX(x/2) */
#define UNFIXH(x)  RIGHT_SHIFT((x) + (ONE << LG2_DCT_SCALE), LG2_DCT_SCALE+1)

/* Take a value scaled by DCT_SCALE and round to integer scaled by OVERSCALE */
#define UNFIXO(x)  RIGHT_SHIFT((x) + (ONE << (LG2_DCT_SCALE-1-LG2_OVERSCALE)),\
				   LG2_DCT_SCALE-LG2_OVERSCALE)

/* Here are the constants we need */
/* SIN_i_j is sine of i*pi/j, scaled by DCT_SCALE */
/* COS_i_j is cosine of i*pi/j, scaled by DCT_SCALE */

#define SIN_1_4 FIX(0.707106781)
#define COS_1_4 SIN_1_4

#define SIN_1_8 FIX(0.382683432)
#define COS_1_8 FIX(0.923879533)
#define SIN_3_8 COS_1_8
#define COS_3_8 SIN_1_8

#define SIN_1_16 FIX(0.195090322)
#define COS_1_16 FIX(0.980785280)
#define SIN_7_16 COS_1_16
#define COS_7_16 SIN_1_16

#define SIN_3_16 FIX(0.555570233)
#define COS_3_16 FIX(0.831469612)
#define SIN_5_16 COS_3_16
#define COS_5_16 SIN_3_16

/* OSIN_i_j is sine of i*pi/j, scaled by DCT_SCALE/OVERSCALE */
/* OCOS_i_j is cosine of i*pi/j, scaled by DCT_SCALE/OVERSCALE */

#define OSIN_1_4 FIXO(0.707106781)
#define OCOS_1_4 OSIN_1_4

#define OSIN_1_8 FIXO(0.382683432)
#define OCOS_1_8 FIXO(0.923879533)
#define OSIN_3_8 OCOS_1_8
#define OCOS_3_8 OSIN_1_8

#define OSIN_1_16 FIXO(0.195090322)
#define OCOS_1_16 FIXO(0.980785280)
#define OSIN_7_16 OCOS_1_16
#define OCOS_7_16 OSIN_1_16

#define OSIN_3_16 FIXO(0.555570233)
#define OCOS_3_16 FIXO(0.831469612)
#define OSIN_5_16 OCOS_3_16
#define OCOS_5_16 OSIN_3_16







/*
 * Perform the forward DCT on one block of samples.
 *
 * A 2-D DCT can be done by 1-D DCT on each row
 * followed by 1-D DCT on each column.
 */

void j_fwd_dct (DCTBLOCK data)
{
  int pass, rowctr;
  register DCTELEM *inptr, *outptr;
  DCTBLOCK workspace;

  /* Each iteration of the inner loop performs one 8-point 1-D DCT.
   * It reads from a *row* of the input matrix and stores into a *column*
   * of the output matrix.  In the first pass, we read from the data[] array
   * and store into the local workspace[].  In the second pass, we read from
   * the workspace[] array and store into data[], thus performing the
   * equivalent of a columnar DCT pass with no variable array indexing.
   */

  inptr = &data[0][0];			/* initialize pointers for first pass */
  outptr = &workspace[0][0];
  for (pass = 1; pass >= 0; pass--) {
	for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
	  /* many tmps have nonoverlapping lifetime -- flashy register colourers
	   * should be able to do this lot very well
	   */
	  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
	  INT32 tmp10, tmp11, tmp12, tmp13;
	  INT32 tmp14, tmp15, tmp16, tmp17;
	  INT32 tmp25, tmp26;
	  //SHIFT_TEMPS

	  tmp0 = inptr[7] + inptr[0];
	  tmp1 = inptr[6] + inptr[1];
	  tmp2 = inptr[5] + inptr[2];
	  tmp3 = inptr[4] + inptr[3];
	  tmp4 = inptr[3] - inptr[4];
	  tmp5 = inptr[2] - inptr[5];
	  tmp6 = inptr[1] - inptr[6];
	  tmp7 = inptr[0] - inptr[7];

	  tmp10 = tmp3 + tmp0;
	  tmp11 = tmp2 + tmp1;
	  tmp12 = tmp1 - tmp2;
	  tmp13 = tmp0 - tmp3;

	  outptr[        0] = (DCTELEM) UNFIXH((tmp10 + tmp11) * SIN_1_4);
	  outptr[DCTSIZE*4] = (DCTELEM) UNFIXH((tmp10 - tmp11) * COS_1_4);

	  outptr[DCTSIZE*2] = (DCTELEM) UNFIXH(tmp13*COS_1_8 + tmp12*SIN_1_8);
	  outptr[DCTSIZE*6] = (DCTELEM) UNFIXH(tmp13*SIN_1_8 - tmp12*COS_1_8);

	  tmp16 = UNFIXO((tmp6 + tmp5) * SIN_1_4);
	  tmp15 = UNFIXO((tmp6 - tmp5) * COS_1_4);

	  OVERSHIFT(tmp4);
	  OVERSHIFT(tmp7);

	  /* tmp4, tmp7, tmp15, tmp16 are overscaled by OVERSCALE */

	  tmp14 = tmp4 + tmp15;
	  tmp25 = tmp4 - tmp15;
	  tmp26 = tmp7 - tmp16;
	  tmp17 = tmp7 + tmp16;

	  outptr[DCTSIZE  ] = (DCTELEM) UNFIXH(tmp17*OCOS_1_16 + tmp14*OSIN_1_16);
	  outptr[DCTSIZE*7] = (DCTELEM) UNFIXH(tmp17*OCOS_7_16 - tmp14*OSIN_7_16);
	  outptr[DCTSIZE*5] = (DCTELEM) UNFIXH(tmp26*OCOS_5_16 + tmp25*OSIN_5_16);
	  outptr[DCTSIZE*3] = (DCTELEM) UNFIXH(tmp26*OCOS_3_16 - tmp25*OSIN_3_16);

	  inptr += DCTSIZE;		/* advance inptr to next row */
	  outptr++;			/* advance outptr to next column */
	}
	/* end of pass; in case it was pass 1, set up for pass 2 */
	inptr = &workspace[0][0];
	outptr = &data[0][0];
  }
}

void j_rev_dct (DCTBLOCK data)
{
  int pass, rowctr;
  register DCTELEM *inptr, *outptr;
  DCTBLOCK workspace;

  /* Each iteration of the inner loop performs one 8-point 1-D IDCT.
   * It reads from a *row* of the input matrix and stores into a *column*
   * of the output matrix.  In the first pass, we read from the data[] array
   * and store into the local workspace[].  In the second pass, we read from
   * the workspace[] array and store into data[], thus performing the
   * equivalent of a columnar IDCT pass with no variable array indexing.
   */

  inptr = &data[0][0];			/* initialize pointers for first pass */
  outptr = &workspace[0][0];
  for (pass = 1; pass >= 0; pass--) {
	for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
	  /* many tmps have nonoverlapping lifetime -- flashy register colourers
	   * should be able to do this lot very well
	   */
	  INT32 in0, in1, in2, in3, in4, in5, in6, in7;
	  INT32 tmp10, tmp11, tmp12, tmp13;
	  INT32 tmp20, tmp21, tmp22, tmp23;
	  INT32 tmp30, tmp31;
	  INT32 tmp40, tmp41, tmp42, tmp43;
	  INT32 tmp50, tmp51, tmp52, tmp53;
	  //SHIFT_TEMPS

	  in0 = inptr[0];
	  in1 = inptr[1];
	  in2 = inptr[2];
	  in3 = inptr[3];
	  in4 = inptr[4];
	  in5 = inptr[5];
	  in6 = inptr[6];
	  in7 = inptr[7];

	  /* These values are scaled by DCT_SCALE */

	  tmp10 = (in0 + in4) * COS_1_4;
	  tmp11 = (in0 - in4) * COS_1_4;
	  tmp12 = in2 * SIN_1_8 - in6 * COS_1_8;
	  tmp13 = in6 * SIN_1_8 + in2 * COS_1_8;

	  tmp20 = tmp10 + tmp13;
	  tmp21 = tmp11 + tmp12;
	  tmp22 = tmp11 - tmp12;
	  tmp23 = tmp10 - tmp13;

	  /* These values are scaled by OVERSCALE */

	  tmp30 = UNFIXO((in3 + in5) * COS_1_4);
	  tmp31 = UNFIXO((in3 - in5) * COS_1_4);

	  OVERSHIFT(in1);
	  OVERSHIFT(in7);

	  tmp40 = in1 + tmp30;
	  tmp41 = in7 + tmp31;
	  tmp42 = in1 - tmp30;
	  tmp43 = in7 - tmp31;

	  /* And these are scaled by DCT_SCALE */

	  tmp50 = tmp40 * OCOS_1_16 + tmp41 * OSIN_1_16;
	  tmp51 = tmp40 * OSIN_1_16 - tmp41 * OCOS_1_16;
	  tmp52 = tmp42 * OCOS_5_16 + tmp43 * OSIN_5_16;
	  tmp53 = tmp42 * OSIN_5_16 - tmp43 * OCOS_5_16;

	  outptr[        0] = (DCTELEM) UNFIXH(tmp20 + tmp50);
	  outptr[DCTSIZE  ] = (DCTELEM) UNFIXH(tmp21 + tmp53);
	  outptr[DCTSIZE*2] = (DCTELEM) UNFIXH(tmp22 + tmp52);
	  outptr[DCTSIZE*3] = (DCTELEM) UNFIXH(tmp23 + tmp51);
	  outptr[DCTSIZE*4] = (DCTELEM) UNFIXH(tmp23 - tmp51);
	  outptr[DCTSIZE*5] = (DCTELEM) UNFIXH(tmp22 - tmp52);
	  outptr[DCTSIZE*6] = (DCTELEM) UNFIXH(tmp21 - tmp53);
	  outptr[DCTSIZE*7] = (DCTELEM) UNFIXH(tmp20 - tmp50);

	  inptr += DCTSIZE;		/* advance inptr to next row */
	  outptr++;			/* advance outptr to next column */
	}
	/* end of pass; in case it was pass 1, set up for pass 2 */
	inptr = &workspace[0][0];
	outptr = &data[0][0];
  }
}

FILE *outfi,*infi;


void pack(UBYTE *image, SLONG q, SLONG w, SLONG h)
{
	SLONG c0,c1,c2,c3,c4,c5;
	SLONG n=(w/8)*(h/8);
	DCTBLOCK *dct=(DCTBLOCK*)malloc(sizeof(DCTBLOCK)*n);
	DCTBLOCK qual;
	for (c3=0;c3<8;c3++)
	for (c4=0;c4<8;c4++)
	{
		qual[c3][c4]=(c3+c4)*q+3;
	}
	c0=0;
	for (c1=0;c1<h/8;c1++)
	for (c2=0;c2<w/8;c2++)
	{
		for (c3=0;c3<8;c3++)
		for (c4=0;c4<8;c4++)
		{
			dct[c0][c4][c3]=image[(c1*8+c3)*w+c2*8+c4];
		}
		j_fwd_dct(dct[c0]);
		for (c3=0;c3<8;c3++)
		for (c4=0;c4<8;c4++)
		{
			c5=floor(dct[c0][c4][c3]/qual[c4][c3]+0.5);
			if (c3!=0 || c4!=0)
			{
				if (c5<-128) c5=-128;
				if (c5>127) c5=127;
			}
			dct[c0][c4][c3]=c5;

		}
		c0++;
	}
	for (c1=n-1;c1>0;c1--) dct[c1][0][0]-=dct[c1-1][0][0];
	for (c5=63;c5>=0;c5--)
	{
		printf(".");flushall();
		c3=zigzag[c5][0];
		c4=zigzag[c5][1];
		if (c3==0 && c4==0)
		{
			for (c1=0;c1<n;c1+=2)
			{
				c2=((dct[c1+1][0][0]&0xfff)<<12)+(dct[c1][0][0]&0xfff);
				fwrite(&c2,1,3,outfi);
			}
		}
		else
		{
			for (c1=0;c1<n;c1++)
			{
				c2=(dct[c1][c3][c4]);
				if (c2<-128) c2=128;if (c2>127) c2=127;
				fwrite(&c2,1,1,outfi);
			}
		}
	}
	free(dct);
}

void unpack(UBYTE *image, SLONG q,SLONG w, SLONG h)
{
	SLONG c0,c1,c2,c3,c4,c5;
	SLONG n=(w/8)*(h/8);
	DCTBLOCK *dct=(DCTBLOCK*)malloc(sizeof(DCTBLOCK)*n);
	DCTBLOCK qual;
	for (c3=0;c3<8;c3++)
	for (c4=0;c4<8;c4++)
	{
		qual[c3][c4]=(c3+c4)*q+3;
	}

	for (c5=63;c5>=0;c5--)
	{
		c3=zigzag[c5][0];
		c4=zigzag[c5][1];
		if (c3==0 && c4==0)
		{
			for (c1=0;c1<n;c1+=2)
			{
				c2=0;
				fread(&c2,1,3,infi);
				dct[c1][0][0]=(c2<<20)>>20;
				dct[c1+1][0][0]=(c2<<8)>>20;
			}
		}
		else
		{
			for (c1=0;c1<n;c1++)
			{
				SBYTE d2;
				fread(&d2,1,1,infi);
				dct[c1][c3][c4]=d2;
			}
		}
	}
	for (c1=1;c1<n;c1++) dct[c1][0][0]+=dct[c1-1][0][0];

	c0=0;
	for (c1=0;c1<h/8;c1++)
	for (c2=0;c2<w/8;c2++)
	{
		for (c3=0;c3<8;c3++)
		for (c4=0;c4<8;c4++)
		{
			dct[c0][c4][c3]*=qual[c4][c3];
		}
		j_rev_dct(dct[c0]);
		for (c3=0;c3<8;c3++)
		for (c4=0;c4<8;c4++)
		{
			c5=dct[c0][c4][c3];
			if (c5<0) c5=0;if (c5>255) c5=255;
			image[(c1*8+c3)*w+c2*8+c4]=c5;
		}
		c0++;
	}
	free(dct);
}


UBYTE *image;


main(int argc, char *argv[])
{
	UBYTE header[64];
	DCTBLOCK dct,qual;
	SLONG c1,c2,c3,c4;

	for (c3=0;c3<8;c3++)
	for (c4=0;c4<8;c4++)
	{
		qual[c3][c4]=(c3+c4)*5+1;
	}

	if (argc<2)
	{
		printf("give me a filename with no extension: pack infile [quality]\n");
		exit(1);
	}
	char ss[128];
        for (c1=0;c1<strlen(argv[1]);c1++) if (argv[1][c1]=='.') argv[1][c1]=0;
        sprintf(ss,"%s.bmp",argv[1]);
	infi=fopen(ss,"rb");
	if (infi==NULL)
	{
		printf("file not found");
		exit(1);
	}
	fread(&header[0],54,1,infi);
	if (header[0]!=0x42 || header[1]!=0x4d)
	{
		printf("invalid bmp file");
		exit(1);
	}

	SLONG w=*((int*)(header+18));
	SLONG 	h=*((int*)(header+22));        
	SLONG bits=header[28];
	SLONG q=3;
	if (argc>2) q=atoi(argv[2]);
	printf("%d x %d x %d quality %d\n",w,h,bits,q);
        if (w&7 || h&7 || bits&7)
        {
                printf("resolution+bits per pixel must be a multiple of 8\n");
                exit(1);
        }
	bits/=8;
	if (bits==1) fseek(infi,1078,SEEK_SET);
	image=(UBYTE*)malloc(bits*w*h);
	UBYTE *lbuf=(UBYTE*)malloc(bits*w);
	for (c1=h-1;c1>=0;c1--)
	{
		fread(lbuf,bits,w,infi);
		UBYTE *lb=lbuf;
		for (c2=0;c2<w;c2++)
		{
			image[c1*w+c2+w*h*0]=*lb++;
			if (bits>1)
			{
				image[c1*w+c2+w*h*1]=*lb++;
				image[c1*w+c2+w*h*2]=*lb++;
			}

		}
	}
	fclose(infi);
	sprintf(ss,"%s.dct",argv[1]);
	outfi=fopen(ss,"wb");
	fwrite(&q,1,1,outfi);
	fwrite(&w,1,2,outfi);
	fwrite(&h,1,2,outfi);
	fwrite(&bits,1,1,outfi);
	pack(image,q,w,h*bits);
	fclose(outfi);
        
        infi=fopen(ss,"rb");
	unpack(image,q,w,h*bits);
	fclose(infi);
	outfi=fopen("out.raw","wb");
	fwrite(image,bits,w*h,outfi);
	fclose(outfi);
        
}
