#include <FastLED.h>
#include <lcdgfx.h>
// #include "lcdgfx_gui.h"

#define SPR_SIZE 18
#define TILE_SIZE 32
#define NUM_COLS 128
#define NUM_ROWS 128
#include "fish.h"

// Global handle for the 128x128, 16-bit colour IL9163 display driven over SPI.
// NOTE(review): following the lcdgfx examples, the first argument is presumably
// the reset pin and the braces are {busId, CS pin, D/C pin, frequency, SCL, SDA},
// where -1 / 0 ask the library for its platform defaults; confirm against the
// lcdgfx documentation before rewiring.
DisplayIL9163_128x128x16_SPI display(3,{-1, 4, 5, 0,-1,-1}); // Use this line for Atmega328p

// Arduino start-up hook: the only initialisation this sketch needs is to
// bring up the display; everything else is recomputed each frame in loop().
void setup() {
  display.begin();
}


// Multiply two 8-bit fractions (0..255 standing for 0..1) and return the
// product as an 8-bit fraction.
//
// The expression equals ((a + 1) * (b + 1) - 1) >> 8, so 255 * 255 yields 255
// (a plain (a * b) >> 8 would give 254) while anything multiplied by 0 is 0.
// The largest intermediate value is 255*255 + 255 + 255 = 65535, which still
// fits in 16 unsigned bits.
//
// The return type is a plain uint8_t: a top-level const on a by-value scalar
// return is ignored by the language and only triggers -Wignored-qualifiers.
uint8_t wu_weight(uint8_t a, uint8_t b) {
  return ((uint16_t) a * b + a + b) >> 8;
}

void loop() {
  uint32_t ms = millis();
  // we don't have enough RAM to hold the full image
  // each line is rendered and sent separately
  uint8_t line_buffer[NUM_COLS * 2];

  // https://wikipedia.org/wiki/Rotation_matrix
  int16_t xdx = cos16(ms * 3);
  int16_t xdy = -sin16(ms * 3);
  int16_t ydx = sin16(ms * 3);
  int16_t ydy = cos16(ms * 3);

  // move the centre of rotation
  uint32_t xstart = -xdx * (NUM_COLS + 1) / 2;
  uint32_t ystart = -ydx * (NUM_COLS + 1) / 2;
  xstart -= xdy * (NUM_ROWS + 1) / 2;
  ystart -= ydy * (NUM_ROWS + 1) / 2;
  
  for (byte row = 0; row < NUM_ROWS; row++) {
    uint8_t *rgb565 = line_buffer;
    uint32_t x = xstart;
    uint32_t y = ystart;
    for (byte column = 0; column < NUM_COLS; column++) {
      CRGB output = 0;
      if (ms & 2048) {
        // non super-sampled
        uint8_t xoffset = ((x >> 15) + (ms >> 5));
        uint8_t yoffset = y >> 15;

        // tile the SPR_SIZExSPR_SIZE sprite on a 32x32 grid
        xoffset %= TILE_SIZE;
        yoffset %= TILE_SIZE;
        // sample 1 pixel from the texture
        if (xoffset < SPR_SIZE && yoffset < SPR_SIZE)
          output =  pgm_read_dword (fish + xoffset + yoffset * SPR_SIZE);

      } else {
        // super-sampled (the values are 256x bigger, so the bottom 8-bits are the fractional part)
        uint16_t xoffset = (x >> 7) + (ms << 3);
        uint16_t yoffset = y >> 7;

        // extract the fractional parts and derive their inverses
        uint8_t xx = xoffset & 0xff, yy = yoffset & 0xff, ix = 255 - xx, iy = 255 - yy;

        // co-ordinates of the 4 texture samples
        uint8_t x0 = (xoffset >> 8) % TILE_SIZE;
        uint8_t y0 = (yoffset >> 8) % TILE_SIZE;
        uint8_t x1 = (x0 + 1) % TILE_SIZE;
        uint8_t y1 = (y0 + 1) % TILE_SIZE;

        // sample 4 texture pixels, scaling them by the Wu weight of the fractional offset
        // keep tiling the 18x18 texture lookups on a 32x32 grid
        CRGB sample;
        if (x0 < SPR_SIZE) {
          if (y0 < SPR_SIZE) {
            sample = pgm_read_dword_near(fish + x0 + y0 * SPR_SIZE);
            output.r = (sample.r * wu_weight(ix, iy)) >> 8;
            output.g = (sample.g * wu_weight(ix, iy)) >> 8;
            output.b = (sample.b * wu_weight(ix, iy)) >> 8;
          }
          if (y1 < SPR_SIZE) {
            sample = pgm_read_dword_near(fish + x0 + y1 * SPR_SIZE);
            output.r += (sample.r * wu_weight(ix, yy)) >> 8;
            output.g += (sample.g * wu_weight(ix, yy)) >> 8;
            output.b += (sample.b * wu_weight(ix, yy)) >> 8;
          }
        }
        if (x1 < SPR_SIZE) {
          if (y0 < SPR_SIZE) {
            sample = pgm_read_dword_near(fish + x1 + y0 * SPR_SIZE);
            output.r += (sample.r * wu_weight(xx, iy)) >> 8;
            output.g += (sample.g * wu_weight(xx, iy)) >> 8;
            output.b += (sample.b * wu_weight(xx, iy)) >> 8;
          }
          if (y1 < SPR_SIZE) {
            sample = pgm_read_dword_near(fish + x1 + y1 * SPR_SIZE);
            output.r += (sample.r * wu_weight(xx, yy)) >> 8;
            output.g += (sample.g * wu_weight(xx, yy)) >> 8;
            output.b += (sample.b * wu_weight(xx, yy)) >> 8;
          }
        }
      }
      // convert to lcdgfx's rgb565 format and swap endianness
      // uint16_t rgb16 = RGB_COLOR16(output.r, output.g, output.b);
      // *rgb++ = rgb16 >> 8;
      // *rgb++ = rgb16 & 0xff;

      *rgb565++ = (output.r & 0xf8) | (output.g >> 5);
      *rgb565++ = ((output.g << 3) & 0xe0) | (output.b >> 3);

      x += xdx; y+= ydx;
    }
    display.drawBuffer16(0, row, NUM_COLS, 1, line_buffer);
    xstart += xdy; ystart += ydy;
  }
  lcd_delay(20);
}