#include <iostream>
#include <fstream>
#include <array>
#include <vector>
#include <cmath>
#include <chrono>
#include <list>
#include <cassert>
#include <unistd.h>
#include <iomanip>
#include <algorithm>
#include <thread>
//#include "mingw-std-threads-master/mingw.thread.h"
#include "optionparser-1.7/src/optionparser.h"



/*
----BBB 25 FS----
ffmpeg -i Source/BigBuckBunny_sunflower_1080p_60fps_normal.mp4 -r 25 -an -pix_fmt gray -vf "scale=256:192" -f rawvideo bbb25fs.raw
./rvcencode --start 1514 --frames 1091 --threads 8 --output seq --image fs bbb25fs.raw bbb25fs
ffmpeg -r 25 -i bbb25fs%06d.tga bbb25fs.gif
rm *.tga
./rvcencode --start 1514 --frames 1091 --threads 8  --image fs bbb25fs.raw bbb25fs.rvc
=> 1091 frames

----BBB 25 WS----
ffmpeg -i Source/BigBuckBunny_sunflower_1080p_60fps_normal.mp4 -r 25 -an -pix_fmt gray -vf "scale=256:144" -f rawvideo bbb25ws.raw
./rvcencode --start 1514 --frames 1091 --threads 8 --output seq bbb25ws.raw bbb25ws
ffmpeg -r 25 -i bbb25ws%06d.tga bbb25ws.gif
rm *.tga
./rvcencode --start 1514 --frames 1091 --threads 8 bbb25ws.raw bbb25ws.rvc
=> 1091 frames

----BBB 30 WS----
ffmpeg -i Source/BigBuckBunny_sunflower_1080p_60fps_normal.mp4 -r 30 -an -pix_fmt gray -vf "scale=256:144" -f rawvideo bbb30ws.raw
./rvcencode --start 1816 --frames 1312 --threads 8 --output seq bbb30ws.raw bbb30ws
ffmpeg -r 30 -i bbb30ws%06d.tga bbb30ws.gif
rm *.tga
./rvcencode --start 1816 --frames 1312 --threads 8 bbb30ws.raw bbb30ws.rvc
=> 1312 frames

----Caminandez 25 WS----
ffmpeg -i Source/caminandes_llamigos_1080p.mp4 -an -pix_fmt gray -vf "scale=256:144, eq=gamma=1.1:contrast=1.1" -f rawvideo caminandez25ws.raw
./rvcencode --threads 8 --output seq caminandez25ws.raw caminandez25ws
ffmpeg -r 25 -i caminandez25ws%06d.tga caminandez25ws.gif
rm *.tga
./rvcencode --threads 8 caminandez25ws.raw caminandez25ws.rvc
=> 3601 frames

----Bad Apple 25 FS----
ffmpeg -i Source/BadApple720p60fps.mp4 -r 25 -an -pix_fmt gray -vf "scale=256:192, eq=gamma=1.1:contrast=1.1" -f rawvideo ba25fs.raw
./rvcencode --threads 8 --image fs --output seq --dither bayer4 --frames 1000 ba25fs.raw ba25fs
ffmpeg -r 25 -i ba25fs%06d.tga ba25fs.gif
rm *.tga
./rvcencode --threads 8 --image fs --dither bayer4 --frames 1000 ba25fs.raw ba25fs.rvc
=> 1000 frames
*/














using namespace std;

// How greyscale pixels are reduced to pure black / white.
enum ditherMode_e
{
    DITHER_MODE_BAYER_8 = 1,    // per-pixel threshold from ditherMatrix8
    DITHER_MODE_BAYER_4 = 2,    // per-pixel threshold from ditherMatrix4
    DITHER_MODE_SIMPLE  = 3     // fixed threshold (pixel > 127 becomes white)
};

// Picture format of the encoded stream.
enum imageMode_e
{
    IMAGE_MODE_4_3  = 1,    // selected with "--image fs"
    IMAGE_MODE_16_9 = 2     // selected with "--image ws"
};

// Kind of output produced by the encoder.
enum outputMode_e
{
    OUTPUT_MODE_BIN = 1,    // selected with "--output rvc"
    OUTPUT_MODE_SEQ = 2     // selected with "--output seq" (numbered .tga files)
};

// ---- Global encoder state, shared by every function in this file ----
string fileNameIn;              // input path; not read in this part of the file
string fileNameOut;             // .rvc output path, or filename prefix for the numbered .tga files
uint32_t startImage;            // presumably the first frame to encode (--start) — set outside this section
uint32_t nrOfImagesToRender;    // presumably the number of frames to encode (--frames) — set outside this section
uint32_t nrOfFixedBlocks;       // if non-zero, every 4th image gets this many flat grey blocks instead of chosen ones
uint32_t nrOfThreads;           // worker threads used for the distance-matrix calculations
uint32_t nrOfKMeanLoops;        // number of refinement passes in chooseKMeansPlusPlus()
uint32_t imageSizeX;            // image width in pixels
uint32_t imageSizeY;            // image height in pixels
uint32_t imageSizeCols;         // not used in this part of the file; presumably width in 8x8 cells — TODO confirm
uint32_t imageSizeRows;         // not used in this part of the file; presumably height in 8x8 cells — TODO confirm
uint32_t nrOfCellsImage;        // number of 8x8 cells in one image; also the index of the first codebook block in blockData
uint32_t nrOfBlocksTotal;       // codebook size: new blocks plus old (re-used) blocks
uint32_t nrOfBlocksNew;         // codebook blocks that are (re)defined for every image
uint32_t nrOfBlocksOld;         // not used in this part of the file
uint32_t imageSizePixels;       // not used in this part of the file
imageMode_e imageMode;
outputMode_e outputMode;
ditherMode_e ditherMode;
constexpr uint32_t NR_OF_PIXELS_IN_BLOCK = 8 * 8;
using pixelData_t = uint8_t;    // one 8-bit greyscale pixel
vector<pixelData_t> imageIn;    // current source frame, row-major, imageSizeX pixels per row
vector<pixelData_t> imageOut;   // reconstructed frame (same layout), written out as .tga
using imageBlock_t = array<pixelData_t, NR_OF_PIXELS_IN_BLOCK>;    // one 8x8 block, row-major
// blockData[0 .. nrOfCellsImage) holds the cells of the current image,
// blockData[nrOfCellsImage .. nrOfCellsImage + nrOfBlocksTotal) holds the codebook blocks.
vector<imageBlock_t> blockData;
vector<imageBlock_t> blockDataDithered;    // black/white (0 or 255) version of the codebook blocks, indexed from 0
vector<vector<double>> blockDistance;      // blockDistance[a][b] = distanceCalc() between blockData[a] and blockData[b]
vector<vector<uint32_t>> blockList;        // one group per codebook block; element 0 is the group head
vector<uint32_t> imageBlocksBest;          // per image cell: blockData index of the block chosen to represent it

// 8x8 ordered-dither threshold matrix, indexed [row][column].
// Each entry is a rank 0..63 scaled to the 0..255 pixel range with integer
// arithmetic (255 * rank / 64). A pixel becomes white when it is strictly
// greater than the threshold at its position.
array<array<pixelData_t, 8>, 8> ditherMatrix8 =
{
    {
        {255* 0/64, 255*48/64, 255*12/64, 255*60/64, 255* 3/64, 255*51/64, 255*15/64, 255*63/64},
        {255*32/64, 255*16/64, 255*44/64, 255*28/64, 255*35/64, 255*19/64, 255*47/64, 255*31/64},
        {255* 8/64, 255*56/64, 255* 4/64, 255*52/64, 255*11/64, 255*59/64, 255* 7/64, 255*55/64},
        {255*40/64, 255*24/64, 255*36/64, 255*20/64, 255*43/64, 255*27/64, 255*39/64, 255*23/64},
        {255* 2/64, 255*50/64, 255*14/64, 255*62/64, 255* 1/64, 255*49/64, 255*13/64, 255*61/64},
        {255*34/64, 255*18/64, 255*46/64, 255*30/64, 255*33/64, 255*17/64, 255*45/64, 255*29/64},
        {255*10/64, 255*58/64, 255* 6/64, 255*54/64, 255* 9/64, 255*57/64, 255* 5/64, 255*53/64},
        {255*42/64, 255*26/64, 255*38/64, 255*22/64, 255*41/64, 255*25/64, 255*37/64, 255*21/64}
    }
};

// 4x4 ordered-dither threshold matrix, indexed [row][column].
// Each entry is a rank 0..15 scaled to the 0..255 pixel range with integer
// arithmetic (255 * rank / 16). A pixel becomes white when it is strictly
// greater than the threshold at its position.
array<array<pixelData_t, 4>, 4> ditherMatrix4 =
{
    {
        {255* 0/16, 255* 8/16, 255* 2/16, 255*10/16},
        {255*12/16, 255* 4/16, 255*14/16, 255* 6/16},
        {255* 3/16, 255*11/16, 255* 1/16, 255* 9/16},
        {255*15/16, 255* 7/16, 255*13/16, 255* 5/16}
    }
};

// Status codes returned by the file writers.
enum TgaError
{
    TGA_OK              = 0,    // file written
    TGA_FILE_NOT_OPENED = 1     // output file could not be opened
};

// File header written verbatim at the start of every .tga file by tgaWriteSequence().
// width and height default to 0 and are not assigned anywhere in this part of
// the file, so they must be filled in elsewhere before the first write.
// NOTE(review): the struct is dumped with a raw sizeof() copy, so the on-disk
// layout relies on the compiler inserting no padding and on host byte order
// for the 16-bit fields — confirm when porting.
struct
{
    uint8_t  idLength = 0;
    uint8_t  colorMapType = 0;
    uint8_t  imageType = 3;         // presumably "uncompressed greyscale" in the TGA format — confirm against the spec
    //Color map specification
    uint8_t entryIndexLo = 0;
    uint8_t entryIndexHi = 0;
    uint8_t colorMaplengthLo = 0;
    uint8_t colorMaplengthHi = 0;
    uint8_t colorMapEntrySize = 0;
    //Image specification
    uint16_t xOrigin = 0;
    uint16_t yOrigin = 0;
    uint16_t width = 0;
    uint16_t height = 0;
    uint8_t  pixelDepth = 8;        // one byte per pixel, matching pixelData_t
    uint8_t  descriptor = 0;        
} tgaHeader;

//################################                
//# Write one in a sequence of .tga files
//################################
uint32_t tgaWriteSequence(uint32_t imageNr)
{    
    stringstream fileNameWithNr;
    //create a filename with a 6-digit sequential number, leading zeroes
    fileNameWithNr << fileNameOut << setw(6) << setfill('0') << to_string(imageNr) << ".tga";    
    // Open the image file in binary mode
    ofstream f_img(fileNameWithNr.str(), ios::binary);        
    if (!f_img.is_open())
        return TGA_FILE_NOT_OPENED;        
    //write tga file header
    f_img.write(reinterpret_cast<const char*>(&tgaHeader), sizeof(tgaHeader));
    //write image
    for(int32_t i = imageSizeY - 1; i >= 0; --i)
    {
        f_img.write(reinterpret_cast<const char*>(&imageOut[i  * imageSizeX]), imageSizeX);
    }
    //Done!
    f_img.close();
    return TGA_OK;
}    

//################################                
//# Write to .RVC file
//################################
uint32_t binWrite(uint32_t currentImage)
{
    // Open the image file in binary mode, write to end of file
    ofstream f_img(fileNameOut, ios::app | ios::binary);    
    if (!f_img.is_open())
    {
        return 1;
    }
    //When doing ws, the first 1024 bytes needs to be a full (128) block definition. 
    //Needed for filling the cache-blocks.
    if((imageMode == IMAGE_MODE_16_9) && (currentImage == 0))
    {        
        for(uint32_t block = 0; block < nrOfBlocksTotal; ++block)
        {
            for(uint32_t y = 0; y < 8; ++y)
            {
                uint8_t byteToWrite = 0;
                for(uint32_t x = 0; x < 8; ++x)
                {   
                    if(blockDataDithered[block][(y * 8) + x] == 255)
                    {
                        byteToWrite = byteToWrite << 1;
                    }
                    else
                    {
                        byteToWrite = (byteToWrite << 1) | 1;
                    }
                }              
                //char data for blocks 64-127 must be inverted
                if(block > 63) 
                {
                    byteToWrite ^= 255;
                }
                f_img << byteToWrite;
            }
        }            
    }
    //write image, 768 or 1024 bytes
    for(uint32_t i = 0; i < nrOfCellsImage; ++i)
    {
        uint8_t byteToWrite = imageBlocksBest[i] - nrOfCellsImage;
        if(imageMode == IMAGE_MODE_16_9)
        {
            if((byteToWrite & 31) < 24)
            {
                //brute force WHERE to write depending on moving cache
                byteToWrite = ((((currentImage & 3) - (byteToWrite >> 5)) & 3) << 5) + (byteToWrite & 31);
            }
        }
        else//(imageMode == IMAGE_MODE_4_3)
        {
            //brute force WHERE to write depending on moving cache
            byteToWrite = ((((currentImage & 3) - (byteToWrite >> 5)) & 3) << 5) + (byteToWrite & 31);
        }
        //blocks 64-127 are actually 192-255 on the ZX81
        if(byteToWrite > 63)
        {
            byteToWrite = (byteToWrite & 63) + 128;
        }
        f_img << static_cast<unsigned char>(byteToWrite);
    }
    //write block definitions, 32 or 24
    for(uint32_t block = 0; block < nrOfBlocksNew; ++block)
    {
        for(uint32_t y = 0; y < 8; ++y)
        {
            uint8_t byteToWrite = 0;
            for(uint32_t x = 0; x < 8; ++x)
            {   
                if(blockDataDithered[block][(y * 8) + x] == 255)
                {
                    byteToWrite = byteToWrite << 1;
                }
                else
                {
                    byteToWrite = (byteToWrite << 1) | 1;
                }
            }            
            //character definitions during image 2 & 3 has to be inverted
            //image 0 & 1 are unaffected
            if((currentImage % 4) >= 2)
            {
                byteToWrite ^= 255;
            }
            f_img << byteToWrite;
        }
    }    
    //Done!
    f_img.close();
	return TGA_OK;
}

// Mean absolute error between two 8x8 blocks.
// Returns the sum of the per-pixel absolute differences divided by the
// number of pixels in a block (0.0 for identical blocks, at most 255.0).
double distanceCalc(array <pixelData_t, NR_OF_PIXELS_IN_BLOCK> &dataList0, array <pixelData_t, NR_OF_PIXELS_IN_BLOCK> &dataList1)
{
    uint32_t absSum = 0;
    for (uint32_t px = 0; px < NR_OF_PIXELS_IN_BLOCK; ++px)
    {
        const int diff = static_cast<int>(dataList0[px]) - static_cast<int>(dataList1[px]);
        absSum += static_cast<uint32_t>((diff < 0) ? -diff : diff);
    }
    return static_cast<double>(absSum) / NR_OF_PIXELS_IN_BLOCK;
}

//################################                
//# Init greyscale blocks for 16:9 mode.
//# These are fixed (cache) blocks, sent at the beginning of the stream.
//################################
// Fills 32 codebook blocks with flat grey levels 0..255 (255 * level / 31).
// The levels are spread over four banks of 32 blocks: each bank receives
// eight consecutive levels in its last eight slots (offsets 24..31).
void initBlocks_16_9(void)
{
    constexpr uint32_t GREY_LEVELS     = 32;
    constexpr uint32_t BANK_SIZE       = 32;
    constexpr uint32_t LEVELS_PER_BANK = 8;

    for (uint32_t level = 0; level < GREY_LEVELS; ++level)
    {
        const uint8_t  grey = 255 * level / (GREY_LEVELS - 1);
        const uint32_t bank = level / LEVELS_PER_BANK;
        const uint32_t slot = level % LEVELS_PER_BANK;
        blockData[nrOfCellsImage + ((bank + 1) * BANK_SIZE) - LEVELS_PER_BANK + slot].fill(grey);
    }
}

//################################                
//# Re-use old blocks from previous images.
//# The same operation is done on the ZX81 so the data does not have to be sent again.
//################################
// Ages the codebook by one image so that the slots of the new blocks can be
// overwritten while the previous contents stay available as "old" blocks.
// 4:3  : every codebook block moves nrOfBlocksNew slots up, highest slot first.
// 16:9 : within each of the first nrOfBlocksNew slots the block moves one
//        bank (32 slots) up: bank 2 -> 3, bank 1 -> 2, bank 0 -> 1.
void reUseOldBlocks(void)
{
    if (imageMode != IMAGE_MODE_4_3)
    {
        //widescreen: shift bank by bank, oldest bank first so nothing is lost
        for (uint32_t slot = 0; slot < nrOfBlocksNew; ++slot)
        {
            const uint32_t newest = nrOfCellsImage + slot;
            blockData[newest + 96] = blockData[newest + 64];
            blockData[newest + 64] = blockData[newest + 32];
            blockData[newest + 32] = blockData[newest];
        }
        return;
    }

    //fullscreen: walk downwards so a source is never overwritten before it is copied
    const uint32_t firstDestination = nrOfCellsImage + nrOfBlocksNew;
    for (uint32_t destination = nrOfCellsImage + nrOfBlocksTotal; destination-- > firstDestination; )
    {
        blockData[destination] = blockData[destination - nrOfBlocksNew];
    }
}        

//################################                
//# Create blockdata from image
//# These are the uncompressed, complete image blocks
//################################
void createBlockData(void)
{
    uint32_t block;
    uint32_t row;
    uint32_t col;
    for (block = 0; block < nrOfCellsImage; ++block)
    {
        for (row = 0; row <= 7; ++row)
        {            
            for (col = 0; col <= 7; ++col)
            {            
                blockData[block][(row * 8) + col] = 
                    imageIn[
                        ((block / 32) * imageSizeX * 8) +
                        ((block % 32) * 8) + 
                        (row * imageSizeX) + 
                        col
                        ];
            }
        }
    }
    return;
}    

//################################                
//# Create distance matrix image.
//# Does the actual work.
//################################
void createDistanceMatrixImageThread(uint32_t blockStart, uint32_t blockStop)
{
    uint32_t blockX;
    uint32_t blockY;
    double distance;
    for (blockY = blockStart; blockY < blockStop; ++blockY)
    {
        for (blockX = blockY; blockX < nrOfCellsImage; ++blockX)
        {
            if (blockX != blockY)
            {
                distance = distanceCalc(blockData[blockX], blockData[blockY]);
                blockDistance[blockX][blockY] = distance;
                blockDistance[blockY][blockX] = distance;
            }
            else
            {
                blockDistance[blockX][blockY] = 0.0;
            } 
        }
    }
}

//################################                
//# Create threads to calculate the distance matrix for image.
//################################
void createDistanceMatrixImage(void)
{
    thread t[nrOfThreads];
    for (int i = 0; i < nrOfThreads; ++i) 
    {
        t[i] = thread(createDistanceMatrixImageThread,
            nrOfCellsImage * i / nrOfThreads,
            nrOfCellsImage * (i + 1) / nrOfThreads);
    }
    
    for (int i = 0; i < nrOfThreads; ++i) 
    {
        t[i].join();
    }
    return;
}   

//################################                
//# Create distance matrix old.
//# Does the actual work.
//################################
void createDistanceMatrixOldThread(uint32_t blockStart, uint32_t blockStop)
{
    uint32_t blockX;
    uint32_t blockY;
    double distance;
    for (blockY = blockStart; blockY < blockStop; ++blockY)
    {
        for (blockX = (nrOfCellsImage + nrOfBlocksNew); blockX < (nrOfCellsImage + nrOfBlocksTotal); ++blockX)
        {
            distance = distanceCalc(blockData[blockX], blockData[blockY]);
            blockDistance[blockY][blockX] = distance;
        }
    }
}

//################################                
//# Create threads to calculate the distance matrix for old blocks.
//################################
void createDistanceMatrixOld(void)
{
    thread t[nrOfThreads];
    for (int i = 0; i < nrOfThreads; ++i) 
    {
        t[i] = thread(createDistanceMatrixOldThread, 
            nrOfCellsImage * i / nrOfThreads,
            nrOfCellsImage * (i + 1) / nrOfThreads);                 
    }
    
    for (int i = 0; i < nrOfThreads; ++i) 
    {
        t[i].join();
    }
    return;
}   

//################################                
//# Create distance matrix new.
//# Does the actual work.
//################################
void createDistanceMatrixNewThread(uint32_t blockStart, uint32_t blockStop)
{
    uint32_t blockX;
    uint32_t blockY;
    double distance;
    for (blockY = blockStart; blockY < blockStop; ++blockY)
    {
        for (blockX = nrOfCellsImage; blockX < (nrOfCellsImage + nrOfBlocksNew); ++blockX)
        {
            distance = distanceCalc(blockData[blockX], blockData[blockY]);
            blockDistance[blockY][blockX] = distance;
        }
    }
}

//################################                
//# Create threads to calculate the distance matrix for new blocks.
//################################
void createDistanceMatrixNew(void)
{
    thread t[nrOfThreads];
    for (int i = 0; i < nrOfThreads; ++i) 
    {
        t[i] = thread(createDistanceMatrixNewThread, 
            nrOfCellsImage * i / nrOfThreads,
            nrOfCellsImage * (i + 1) / nrOfThreads); 
    }
    
    for (int i = 0; i < nrOfThreads; ++i) 
    {
        t[i].join();
    }
    return;
}   

//################################                
//# Create imageBlocksBest
//# Map all the blocks in the image to which best block it is represented by.
//################################
void createImageBlocksBest(void)
{
    uint32_t i;
    uint32_t j;
    uint32_t chosenBlock;
    uint32_t destinationBlock;
    uint32_t counter = 0;
    for(i = 0; i < blockList.size(); ++i)
    {
        chosenBlock = blockList[i][0];
        for(j = 0; j < blockList[i].size(); ++j)
        {
            destinationBlock = blockList[i][j];
            if(destinationBlock < nrOfCellsImage)
            {
                imageBlocksBest[destinationBlock] = chosenBlock;
                ++counter;
            }
        }
    }
    return;
}

//################################                
//# Create mean blocks for each new-block group.
//# Averages each pixel for each position of all the blocks in a group.
//# The new average block represents the group.
//################################
void createMeanBlocks(uint32_t currentImage)
{
    uint32_t groupStart = 0;
    if((nrOfFixedBlocks != 0) && ((currentImage % 4) == 0))
    {
        groupStart = nrOfFixedBlocks;        
    }
    for(uint32_t currentGroup = groupStart; currentGroup < nrOfBlocksNew; ++currentGroup)
    {
        array<uint32_t, NR_OF_PIXELS_IN_BLOCK> meanBlock = {};
        for(uint32_t currentBlockInGroup = 0; currentBlockInGroup < blockList[currentGroup].size(); ++currentBlockInGroup)
        {
            for(uint32_t i = 0; i < NR_OF_PIXELS_IN_BLOCK; ++i)
            {
                meanBlock[i] += blockData[blockList[currentGroup][currentBlockInGroup]][i];
            }
        }
        for(uint32_t i = 0; i < NR_OF_PIXELS_IN_BLOCK; ++i)
        {
            meanBlock[i] = round(meanBlock[i] / blockList[currentGroup].size());
            blockData[blockList[currentGroup][0]][i] = meanBlock[i];   
        }        
    }
    return;
}    

//################################                
//# Re-create image from blockdata.
//# Used in the process of writing a tga file.
//################################
void reCreateImage(void)
{
    for (uint32_t block = 0; block < nrOfCellsImage; ++block)
    {
        for (uint32_t row = 0; row <= 7; ++row)
        {            
            for (uint32_t col = 0; col <= 7; ++col)
            {            
                imageOut[
                            ((block / 32) * imageSizeX * 8) +
                            ((block % 32) * 8) + 
                            (row * imageSizeX) + 
                            col
                        ] = blockData[imageBlocksBest[block]][(row * 8) + col];                    
            }
        }
    }
    return;
}    

//################################                
//# Dither the image that will be written as a tga file.
//################################
// Converts imageOut in place to pure black (0) / white (255).
// ditherMode: DITHER_MODE_BAYER_8 and DITHER_MODE_BAYER_4 take the threshold
//             from the matching dither matrix, tiled over the image; any
//             other value uses a fixed threshold of 127.
// A pixel becomes white when it is strictly greater than its threshold.
void ditherImage(ditherMode_e ditherMode)
{
    for (uint32_t y = 0; y < imageSizeY; ++y)
    {
        for (uint32_t x = 0; x < imageSizeX; ++x)
        {
            pixelData_t threshold;
            switch (ditherMode)
            {
                case DITHER_MODE_BAYER_8:
                    threshold = ditherMatrix8[y % 8][x % 8];
                    break;
                case DITHER_MODE_BAYER_4:
                    threshold = ditherMatrix4[y % 4][x % 4];
                    break;
                default://DITHER_MODE_SIMPLE
                    threshold = 127;
                    break;
            }
            pixelData_t &pixel = imageOut[x + (y * imageSizeX)];
            pixel = (pixel > threshold) ? 255 : 0;
        }
    }
}

//################################                
//# Create blockDataDithered.
//# Before writing an RVC file, the best blocks has to be dithered.
//################################
// Converts the codebook blocks to pure black (0) / white (255) and stores
// them in blockDataDithered[0 .. nrOfBlocksTotal).
// ditherMode: DITHER_MODE_BAYER_8 and DITHER_MODE_BAYER_4 take the threshold
//             from the matching dither matrix; any other value uses a fixed
//             threshold of 127.
// The dither matrix is addressed by the pixel's row (i / 8) and column
// (i % 8) inside the block, the 4x4 matrix being tiled, which matches what
// ditherImage() does for 8-pixel-aligned blocks. The earlier row formula
// (8 * i / 63, 4 * i / 63) produced row 8 (respectively 4) for the last
// pixel, i.e. a read past the end of the matrix.
// All modes read the block at nrOfCellsImage + block, which is the head that
// blockListInitTotal() stores in blockList[block][0].
void createBlockDataDithered(ditherMode_e ditherMode)
{
    for (uint32_t block = 0; block < nrOfBlocksTotal; ++block)
    {
        const imageBlock_t &source = blockData[nrOfCellsImage + block];
        for (uint32_t i = 0; i < NR_OF_PIXELS_IN_BLOCK; ++i)
        {
            const uint32_t row = i / 8;
            const uint32_t col = i % 8;

            pixelData_t threshold;
            if (ditherMode == DITHER_MODE_BAYER_8)
            {
                threshold = ditherMatrix8[row][col];
            }
            else if (ditherMode == DITHER_MODE_BAYER_4)
            {
                threshold = ditherMatrix4[row % 4][col % 4];
            }
            else//if(ditherMode == DITHER_MODE_SIMPLE)
            {
                threshold = 127;
            }

            blockDataDithered[block][i] = (source[i] > threshold) ? 255 : 0;
        }
    }
}

//################################                
//# Init blockList, total (new+old).
//# Needs to be the correct length and have the correct group-heads.
//################################
void blockListInitTotal(void)
{
    blockList.clear();
    blockList.resize(nrOfBlocksTotal);
    for(uint32_t i = 0; i < nrOfBlocksTotal; ++i)
    {
        blockList[i].push_back(nrOfCellsImage + i);
    }
}

//################################                
//# K-means++ algorithm to choose the best blocks to represent the image.
//################################
// Chooses the new codebook blocks for one image and groups every image cell
// under its closest codebook block.
// currentImage: index of the image being encoded; selects the seeding strategy.
//
// Phase 1 (seeding): picks nrOfBlocksNew image cells, one per new-block group.
//   The first seed(s) depend on the mode (see the three branches below); each
//   further seed is the cell whose smallest distance to the old blocks and to
//   the seeds chosen so far is the largest.
// Phase 2 (refinement): nrOfKMeanLoops passes of "average each group, then
//   re-assign every cell to the closest codebook block".
//
// Reads and writes the globals blockList, blockData and blockDistance.
// Expects blockDistance to already hold the cell-to-cell distances and the
// cell-to-old-block distances, since both are only read here.
void chooseKMeansPlusPlus(uint32_t currentImage)
{
    double distance;
    double distanceMax;
    double distanceMin;
    uint32_t blockMin;
    uint32_t blockMax;
    uint32_t blockFromImage;
    uint32_t blockTest;
    uint32_t blockStart;
    
    //Start from groups that contain only their own head.
    blockListInitTotal();
    
    //Choose the first new block(s), which is part of the ++ of this algorithm.
    //By choosing wisely, the K-means loop produces much better result much faster.
    
    //Fullscreen has a number of fixed blocks, and every 4th image gets some pre-calculated greyscale blocks.
    if((nrOfFixedBlocks != 0) && ((currentImage % 4) == 0))
    {
        for(uint32_t i = 0; i < nrOfFixedBlocks; ++i)
        {
            blockMin = nrOfCellsImage + i;
            //Flat grey block, levels spread evenly from 0 to 255.
            //NOTE(review): divides by zero when nrOfFixedBlocks == 1 — confirm that value can never be configured.
            blockData[blockMin].fill(255 * i / (nrOfFixedBlocks - 1));
            //NOTE(review): blockListInitTotal() already put this index at position 0, so the head is listed twice — confirm intended.
            blockList[i].push_back(blockMin);
            //Distances from every image cell to the fixed block.
            for (uint32_t blockY = 0; blockY < nrOfCellsImage; ++blockY)
            {
                distance = distanceCalc(blockData[blockY], blockData[blockMin]);
                blockDistance[blockY][blockMin] = distance;
            }
        }
        blockStart = nrOfFixedBlocks;
    }
    //Widescreen has a special for its very first image.
    //Chooses the block with most close friends.
    else if((nrOfFixedBlocks == 0) && (currentImage == 0))
    {
        //blockMax = 1 + rand()/((RAND_MAX + 1u) / nrOfCellsImage);
        distanceMin = 100000.0;
        for(blockFromImage = 0; blockFromImage < nrOfCellsImage; ++blockFromImage)
        {
            distance = 0.0;
            for(uint32_t blockOld = 0; blockOld < nrOfCellsImage; ++blockOld)
            {
                //Only count the distance for the closest neighbours (5% of max distance).
                if(blockDistance[blockFromImage][blockOld] < (255 * 0.05))
                {
                    distance += blockDistance[blockFromImage][blockOld];
                }
            }
            //The cell with the smallest sum of near-neighbour distances wins.
            if(distance < distanceMin)
            {
                distanceMin = distance;
                blockMax = blockFromImage;
            }
        }              
        //Insert into the blockList
        blockList[0].push_back(blockMax);
        //Update the distances for this newly chosen block:
        //the column of the chosen cell is copied to the column of new block 0.
        //Only distances are copied here; the pixel data of the head slot is
        //written later by createMeanBlocks().
        for (uint32_t blockY = 0; blockY < nrOfCellsImage; ++blockY)
        {
            distance = blockDistance[blockY][blockMax];
            blockDistance[blockY][nrOfCellsImage + 0] = distance;
        }
        blockStart = 1;        
    }
    //For all the other images:
    //Choose the block that is farthest from any other, including the old blocks (cache).
    else
    {
        distanceMax = -1000000.0;
        for(blockFromImage = 0; blockFromImage < nrOfCellsImage; ++blockFromImage)
        {
            //Smallest distance from this cell to any old block.
            distance = 1000000.0;
            for(uint32_t blockOld = (nrOfCellsImage + nrOfBlocksNew); blockOld < (nrOfCellsImage + nrOfBlocksTotal); ++blockOld)
            {
                distance = min(distance, blockDistance[blockFromImage][blockOld]);
            }
            if(distance > distanceMax)
            {
                distanceMax = distance;
                blockMax = blockFromImage;
            }
        }        
        //Insert into the blockList
        blockList[0].push_back(blockMax);
        //Update the distances for this newly chosen block
        for (uint32_t blockY = 0; blockY < nrOfCellsImage; ++blockY)
        {
            distance = blockDistance[blockY][blockMax];
            blockDistance[blockY][nrOfCellsImage + 0] = distance;
        }
        blockStart = 1;
    }            
    
    //And for the rest of the best blocks,
    //choose the ones that are farthest from any other, 
    //including the old blocks (cache).
    for(uint32_t blockNewSet = blockStart; blockNewSet < nrOfBlocksNew; ++blockNewSet)
    {
        distanceMax = -1000000.0;
        for(uint32_t i = 0; i < nrOfCellsImage; ++i)
        {
            distance = 1000000.0;
            //check distance among the old blocks
            for(blockTest = (nrOfCellsImage + nrOfBlocksNew); blockTest < (nrOfCellsImage + nrOfBlocksTotal); ++blockTest)
            {
                distance = min(distance, blockDistance[i][blockTest]);
            }                    
            //check distance among the already chosen new blocks
            //(blockList[blockTest][0] is the head, i.e. nrOfCellsImage + blockTest)
            for(blockTest = 0; blockTest < blockNewSet; ++blockTest)
            {            
                distance = min(distance, blockDistance[i][blockList[blockTest][0]]);                
            }
            if(distance > distanceMax)
            {
                distanceMax = distance;
                blockMax = i;
            }
        }
        //Insert into the blockList        
        blockList[blockNewSet].push_back(blockMax);
        //Update the distances for this newly chosen block
        for (uint32_t blockY = 0; blockY < nrOfCellsImage; ++blockY)
        {
            distance = blockDistance[blockY][blockMax];
            blockDistance[blockY][nrOfCellsImage + blockNewSet] = distance;
        }
    }
    
    //Here is the K-means loop that refines the seeded groups.
    for(uint32_t kMeanLoop = 0; kMeanLoop < nrOfKMeanLoops; ++ kMeanLoop)
    {
        //Calculate mean blocks for the chosen groups
        createMeanBlocks(currentImage);        
        //update the distance matrix for the new group heads
        createDistanceMatrixNew();        
        //re-init blockList
        blockListInitTotal();        
        //Re-assign all image-blocks to the closest among new and old blocks.
        //NOTE(review): bestGroup stays uninitialized if no distance is below 1000000.0 — not expected for 8-bit pixel data.
        uint32_t bestGroup;
        for (uint32_t block = 0; block < nrOfCellsImage; ++block)
        {
            distanceMin = 1000000.0;
            for (uint32_t group = nrOfCellsImage; group < (nrOfCellsImage + nrOfBlocksTotal); ++group)
            {
                distance = blockDistance[block][group];
                if(distance < distanceMin)
                {
                    distanceMin = distance;
                    bestGroup = group - nrOfCellsImage;
                }
            }
            blockList[bestGroup].push_back(block);
        }
    }
    return;
}

//################################                
//# Configuration for the command line options
//################################
// Argument checkers for the command line parser. Each checker returns
// option::ARG_OK when the option's argument is acceptable and
// option::ARG_ILLEGAL otherwise; when `msg` is true a diagnostic is printed
// to stderr first.
struct Arg: public option::Arg
{
    // Prints msg1, the option name as typed, then msg2 to stderr.
    static void printError(const char* msg1, const option::Option& opt, const char* msg2)
    {
        fprintf(stderr, "%s", msg1);
        fwrite(opt.name, opt.namelen, 1, stderr);
        fprintf(stderr, "%s", msg2);
    }

    // Rejects every option it is attached to.
    static option::ArgStatus Unknown(const option::Option& option, bool msg)
    {
        if (msg)
        {
            printError("Unknown option '", option, "'\n");
        }
        return option::ARG_ILLEGAL;
    }

    // Accepts an argument that is present and not the empty string.
    static option::ArgStatus NonEmpty(const option::Option& option, bool msg)
    {
        const bool hasText = (option.arg != nullptr) && (option.arg[0] != '\0');
        if (!hasText)
        {
            if (msg)
            {
                printError("Option '", option, "' requires a non-empty argument\n");
            }
            return option::ARG_ILLEGAL;
        }
        return option::ARG_OK;
    }    

    // Accepts an argument that strtol() consumes completely as a base-10 number.
    static option::ArgStatus Numeric(const option::Option& option, bool msg)
    {
        char* parseEnd = nullptr;
        if (option.arg != nullptr)
        {
            //only the end position matters here, the value itself is discarded
            strtol(option.arg, &parseEnd, 10);
        }
        const bool consumedAll = (parseEnd != option.arg) && (*parseEnd == '\0');
        if (consumedAll)
        {
            return option::ARG_OK;
        }
        if (msg)
        {
            printError("Option '", option, "' requires a numeric argument\n");
        }
        return option::ARG_ILLEGAL;
    }
};

// Index of each command line option in the parsed options array.
enum optionIndex
{
    UNKNOWN = 0,
    HELP    = 1,
    VERSION = 2,
    OUTPUT  = 3,
    IMAGE   = 4,
    DITHER  = 5,
    THREADS = 6,
    START   = 7,
    FRAMES  = 8,
    LOOPS   = 9
};
// Option table handed to the command line parser and to option::printUsage().
// Each entry lists, in order: the optionIndex value, a type field (always 0
// here), the short option letter, the long option name, the argument checker
// and the help text printed for the option.
// Entries with index UNKNOWN and empty names only contribute help text.
// The all-zero entry at the end presumably terminates the table — confirm
// against the option parser documentation.
const option::Descriptor usage[] =
{
    {UNKNOWN, 0,"" , ""    ,        Arg::None,     "USAGE: rvcencode [options] inFile outFile\n"
                                                   "Options:"},
    {HELP,    0,"h", "help",        Arg::None,     "  -h\n"
                                                   "  --help\n"
                                                   "          Print usage and exit\n"},
    {VERSION, 0,"v","version",      Arg::None,     "  -v\n"
                                                   "  --version\n"
                                                   "          Print program version info and exit\n"},
    {OUTPUT,  0,"o","output",   Arg::NonEmpty,     "  -o <mode>\n"
                                                   "  --output <mode>\n"
                                                   "          Output modes:\n"
                                                   "          rvc = ZX81 RVC file (default)\n"
                                                   "          seq = automatically numbered TGA files\n"},
    {IMAGE,  0,"i", "image",    Arg::NonEmpty,     "  -i <mode>\n"
                                                   "  --image <mode>\n"
                                                   "          Image modes:\n"
                                                   "          ws = widescreen (default)\n"
                                                   "          fs = fullscreen\n"},
    {DITHER,  0,"d","dither",   Arg::NonEmpty,     "  -d <mode>\n"
                                                   "  --dither <mode>\n"
                                                   "          Dither modes:\n"
                                                   "          bayer8 = Ordered Bayer 8x8 (default)\n"
                                                   "          bayer4 = Ordered Bayer 4x4\n"
                                                   "          simple = 50% cut-off mode\n"},                                                                                                      
    {THREADS, 0,"t","threads",   Arg::Numeric,     "  -t <num>\n"
                                                   "  --threads <num>\n"
                                                   "          Number of threads to use\n"
                                                   "          1 to 32, default is 1\n"},
    {START,   0,"s","start",     Arg::Numeric,     "  -s <num>\n"
                                                   "  --start <num>\n"
                                                   "          Which frame to start on\n"
                                                   "          Default is the first frame\n"},
    {FRAMES,  0,"f","frames",    Arg::Numeric,     "  -f <num>\n"
                                                   "  --frames <num>\n"
                                                   "          Number of frames to encode\n"
                                                   "          Default is to the last frame\n"},
    {LOOPS,   0,"l","loops",     Arg::Numeric,     "  -l <num>\n"
                                                   "  --loops <num>\n"
                                                   "          Number of optimization loops\n"
                                                   "          1 to 30, default is 5\n"},                                                   
    {UNKNOWN, 0,"" ,     "",        Arg::None,     "inFile:\n"
                                                   "          Full path to the file to encode.\n"
                                                   "          Requires a raw 8-bit greyscale video file\n"
                                                   "          at 256x192 or 256x144 pixels.\n\n"
                                                   "outFile:\n"
                                                   "          Full path and filename."},
    {0,0,0,0,0,0}
};

//################################                
//# The main program
//################################
int main(int argc, char **argv)
{    
    uint64_t startTime;
    uint64_t stopTime;
    
    //First off, a lot of command line option choices.
    argc-=(argc>0); argv+=(argc>0); // skip program name argv[0] if present
    option::Stats  stats(usage, argc, argv);
    option::Option options[stats.options_max], buffer[stats.buffer_max];
    option::Parser parse(usage, argc, argv, options, buffer);

    if (parse.error())
    {
        option::printUsage(cout, usage);
        return 1;
    }
    
    if (options[HELP] || argc == 0)
    {
        option::printUsage(cout, usage);
        return 0;
    }
    
    if (options[VERSION])
    {
        cout << "rvcencode version 0.3\n" << "built " << __DATE__ << " " <<  __TIME__ << endl;    
        return 0;
    }
    
    if (options[OUTPUT])
    {
        if(options[OUTPUT].arg == string{"rvc"})
        {
            outputMode = OUTPUT_MODE_BIN;
        }
        else if(options[OUTPUT].arg == string{"seq"})
        {
            outputMode = OUTPUT_MODE_SEQ;
        }
        else
        {
            cout << "Wrong output mode" << endl;
            return 1;
        }
    }
    else
    {
        outputMode = OUTPUT_MODE_BIN;
    }
    
    if (options[IMAGE])
    {
        if(options[IMAGE].arg == string{"ws"})
        {
            imageMode = IMAGE_MODE_16_9;
        }
        else if(options[IMAGE].arg == string{"fs"})
        {
            imageMode = IMAGE_MODE_4_3;
        }
        else
        {
            cout << "Wrong image mode" << endl;
            return 1;
        }
    }
    else
    {
        imageMode = IMAGE_MODE_16_9;
    }
    
    if (options[DITHER])
    {
        if(options[DITHER].arg == string{"bayer8"})
        {
            ditherMode = DITHER_MODE_BAYER_8;
        }
        else if(options[DITHER].arg == string{"bayer4"})
        {
            ditherMode = DITHER_MODE_BAYER_4;
        }
        else if(options[DITHER].arg == string{"simple"})
        {
            ditherMode = DITHER_MODE_SIMPLE;
        }
        else
        {
            cout << "Wrong dither mode" << endl;
            return 1;
        }
    }
    else
    {
        ditherMode = DITHER_MODE_BAYER_8;
    }
    
    if (options[THREADS])
    {
        nrOfThreads = atoi(options[THREADS].arg);
        if((nrOfThreads == 0) || (nrOfThreads > 32))
        {
            cout << "Wrong number of threads" << endl;
            return 1;
        }
    }
    else
    {
        nrOfThreads = 1;
    }
    
    if (options[START])
    {
        startImage = atoi(options[START].arg);
    }
    else
    {
        startImage = 0;
    }
    
    if (options[FRAMES])
    {
        nrOfImagesToRender = atoi(options[FRAMES].arg);
    }
    else
    {
        nrOfImagesToRender = 0;
    }
    
    if (options[LOOPS])
    {        
        nrOfKMeanLoops = atoi(options[LOOPS].arg);
        if((nrOfKMeanLoops < 1) || (nrOfKMeanLoops > 30))
        {
            cout << "Wrong number of loops" << endl;
            return 1;
        }
    }
    else
    {
        nrOfKMeanLoops = 5;
    }
    
    if (parse.nonOptionsCount() == 2)
    {        
        fileNameIn = parse.nonOption(0);
        fileNameOut = parse.nonOption(1);     
    }
    else if (parse.nonOptionsCount() == 1)
    {
        cout << "Missing outFile" << endl;
        return 1;
    }
    else if (parse.nonOptionsCount() == 0)
    {
        cout << "Missing inFile and outFile" << endl;
        return 1;
    }
    else
    {
        option::printUsage(cout, usage);
        return 1;
    }
    
    if(imageMode == IMAGE_MODE_4_3)
        if(ditherMode == DITHER_MODE_BAYER_8)
        {
            nrOfFixedBlocks = 16;
        }
        //If we run in 4:3 and dither badly, let's skip many fixed blocks.
        //It's probably Bad Apple anyway :)
        else
        {
            nrOfFixedBlocks = 2;
        }
    else//IMAGE_MODE_16_9
        nrOfFixedBlocks = 0;
    imageSizeX = 256;
    if(imageMode == IMAGE_MODE_4_3)
        imageSizeY = 192;
    else//IMAGE_MODE_16_9
        imageSizeY = 144;
    tgaHeader.height = imageSizeY;
    tgaHeader.width = imageSizeX;
    imageSizeCols = imageSizeX / 8;
    imageSizeRows = imageSizeY / 8;
    nrOfCellsImage = imageSizeCols * imageSizeRows;
    nrOfBlocksTotal = 128;
    if(imageMode == IMAGE_MODE_4_3)
    {
        nrOfBlocksNew = 32;
    }
    else//IMAGE_MODE_16_9
    {
        nrOfBlocksNew = 24;    
    }
    //nrOfBlocksLocked = 32;
    nrOfBlocksOld = nrOfBlocksTotal - nrOfBlocksNew;
    imageSizePixels = imageSizeX * imageSizeY;
    
    //init vectors
    imageIn.resize(imageSizePixels);        
    imageOut.resize(imageSizePixels);
    blockData.resize(nrOfCellsImage + nrOfBlocksTotal);
    blockDataDithered.resize(nrOfBlocksTotal);
    if(imageMode == IMAGE_MODE_16_9)
    {
        initBlocks_16_9();
    }
    imageBlocksBest.resize(nrOfCellsImage);
    blockDistance.resize(nrOfCellsImage);
    for(uint32_t y = 0; y < nrOfCellsImage; ++y)
    {
        blockDistance[y].resize(nrOfCellsImage + nrOfBlocksTotal);
    }

    //Truncate the outFile in RVC mode
    if(outputMode == OUTPUT_MODE_BIN)
    {
        ofstream outFile(fileNameOut, ios::trunc | ios::binary);    
        if (!outFile.is_open())
        {
            return 1;
        }
        outFile.close();        
    }
        
    //Open image file
    ifstream imageFileIn(fileNameIn, ios::binary);
    if (!imageFileIn.is_open())
    {
        cout << "failed to open " << fileNameIn << '\n';
    }     
    
    imageFileIn.seekg(0, ios::end);
    uint32_t nrOfImagesInFile = (imageFileIn.tellg() / imageSizePixels);
    if(startImage > nrOfImagesInFile)
    {
        cout << "ERROR: start image was beyond the end of the video file!" << endl;
        exit(1);
    }
    if((nrOfImagesToRender == 0) || (nrOfImagesToRender > (nrOfImagesInFile - startImage)))
    {
        nrOfImagesToRender = nrOfImagesInFile - startImage;
    }
    
    //Seek to start image
    imageFileIn.seekg(imageSizePixels * startImage);
        
    //the main loop to produce images
    for(uint32_t currentImage = 0; currentImage < nrOfImagesToRender; currentImage += 1)
    {
        startTime = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
        
        //Status counter to know:
        // -how many images are completed
        // -of how many total
        cout << "Image: " << startImage + currentImage + 1 << " / " << startImage + nrOfImagesToRender;
        //...continued further down
                
        //################################                
        //# Read image data from file
        //################################        
        imageFileIn.read(reinterpret_cast<char*>(&imageIn[0]), imageIn.size());        
        
        createBlockData();
        createDistanceMatrixImage();
        reUseOldBlocks();        
        createDistanceMatrixOld();
        
        chooseKMeansPlusPlus(currentImage);
        createImageBlocksBest();        
        
        //################################
        //# Write result to file
        //################################
        if(outputMode == OUTPUT_MODE_BIN)
        {
            createBlockDataDithered(ditherMode);        
            binWrite(currentImage);
        }
        else//if(outputMode == OUTPUT_MODE_SEQ)
        {
            reCreateImage();
            ditherImage(ditherMode);
            tgaWriteSequence(currentImage);
        }
        
        stopTime = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
        //Status counter continued:
        // -how much time last image took to calculate        
        cout << " (" << to_string(stopTime - startTime) << " ms)        " << "\r" << flush;       
    }
    //All done, return to the user.
    return 0;
}
