// Conway's Game of Life
// by jimLee
// https://forum.arduino.cc/t/the-fastest-way-to-move-a-block-of-ram/865841

// I don't have any LCDs, so I added TV out support so I could test on a Mega

// Initial performance at 128x96 PAL on AVR:
// 1697ms to step + 420ms to paint == 0.47 FPS

// Using #pragma GCC optimize "-Ofast" and removing a test from paintGrid(false) got:
// 1078ms to step + 160ms to paint == 0.81 FPS

// Next, I swapped the grid arrays to be [y][x] rather than [x][y]
// paintRasterGrid() relies on this layout. stepTime() got faster too.
// 980ms to step + 61ms to paint == 0.96 FPS

// stepTime() writes 1-bit at a time.
// stepTime8/16/32() coalesce 8-, 16- or 32-bits into a single write
// On AVR, 8 bits is fastest. On Teensy, 32 will probably be quickest. Maybe 16.
// 823ms to step + 60ms to paint == 1.13 FPS


#define TV_OUT

#ifdef TV_OUT
#include <TVout.h>
TVout TV;
#define GRID_X 128
#define GRID_Y 96
#define whitePixel(x, y) TV.set_pixel(y, x, 1)
#define blackPixel(x, y) TV.set_pixel(y, x, 0)

#else
#include <adafruit_1947_Obj.h>
#include <screen.h>
#define GRID_X 320
#define GRID_Y 240
#define whitePixel(x, y) screen->drawPixel(x, y, &white)
#define blackPixel(x, y) screen->drawPixel(x, y, &black)
#endif


const byte BYTES_X = GRID_X / 8;
byte grid[GRID_Y][BYTES_X];
byte tempGrid[GRID_Y][BYTES_X];
long bytes = BYTES_X * GRID_Y;

void setup() {
  Serial.begin(115200);
  Serial.println("Hello?");

#ifdef TV_OUT
  TV.begin(PAL);
  TV.clear_screen();
#else
  if (!initScreen(ADAFRUIT_1947,ADA_1947_SHIELD_CS,PORTRAIT)) {
    Serial.println(F("Screen failed, halting program."));
    while(true);                  //  kill the program.
  }
  screen->fillScreen(&black);
#endif
  clearGrid();
  randomSeed(analogRead(A0));
  // randomFill(10000);               // Put something out there to watch.
  randomFill2();

  setGrid(grid, 13, 12, true); // Flyer
  setGrid(grid, 14, 13, true);
  setGrid(grid, 15, 13, true);
  setGrid(grid, 13, 14, true);
  setGrid(grid, 14, 14, true);

  setGrid(grid, 34, 44, true); // Med spaceship
  setGrid(grid, 35, 44, true);
  setGrid(grid, 36, 44, true);
  setGrid(grid, 37, 44, true);
  setGrid(grid, 38, 44, true);
  setGrid(grid, 33, 45, true);
  setGrid(grid, 38, 45, true);
  setGrid(grid, 38, 46, true);
  setGrid(grid, 33, 47, true);
  setGrid(grid, 37, 47, true);
  setGrid(grid, 35, 48, true);

  setGrid(grid, 63, 70, true); // Blinker. Toad?
  setGrid(grid, 64, 70, true);
  setGrid(grid, 65, 70, true);
  setGrid(grid, 64, 71, true);
  setGrid(grid, 65, 71, true);
  setGrid(grid, 66, 71, true);

  // paintGrid(true);
  paintRasterGrid(grid);
}


void randomFill(int numDots) {

  int x;
  int y;

  for (int i = 0; i < numDots; i++) {
    x = random(0, GRID_X);
    y = random(0, GRID_Y);
    setGrid(grid, x, y, true);
  }
}


void randomFill2() {
  byte* gPtr;

  gPtr = (byte*) &grid[0][0];
  for (int i = 0; i < bytes; i++) {
    gPtr[i] = random(255);
  }
}


void clearGrid(void) {

  byte* gPtr;

  gPtr = (byte*) &grid[0][0];
  for (int i = 0; i < GRID_X / 8 * GRID_Y; i++) {
    gPtr[i] = 0;
  }
}


void gridToTemp(void) {

  void* gPtr;
  void* tPtr;

  gPtr = (void*) &grid[0][0];
  tPtr = (void*) &tempGrid[0][0];
  memcpy(tPtr, gPtr, bytes);
}


void tempToGrid(void) {

  byte* gPtr;
  byte* tPtr;

  gPtr = (byte*) &grid[0][0];
  tPtr = (byte*) &tempGrid[0][0];
  memcpy(gPtr, tPtr, bytes);
}

#pragma GCC push_options
#pragma GCC optimize "-Ofast"

void  setGrid(byte grid[][BYTES_X], int x, int y, bool value) {

  int   xIndex;
  byte  xBit;

  if (x < 0 || x >= GRID_X) return;
  if (y < 0 || y >= GRID_Y) return;
  xIndex = x >> 3;
  // xBit = x - (xIndex << 3);
  xBit = x & 7;
  if (value) {
    bitSet(grid[y][xIndex], xBit);
  } else {
    bitClear(grid[y][xIndex], xBit);
  }
}


bool getGrid(byte grid[][BYTES_X], int x, int y) {

  int   xIndex;
  byte  xBit;
  bool  result;

  result = false;
  if (x < 0 || x >= GRID_X) return result;
  if (y < 0 || y >= GRID_Y) return result;
  xIndex = x >> 3;
  // xBit = x - (xIndex << 3);
  xBit = x & 7;
  result = (bool) bitRead(grid[y][xIndex], xBit);
  return result;
}



// Any live cell with two or three live neighbours survives.
// Any dead cell with three live neighbours becomes a live cell.
// All other live cells die in the next generation.
// Similarly, all other dead cells stay dead.
bool updatePoint(int x, int y) {

  byte  numLiveCells;
  int   xMinus;
  int   xPlus;

  numLiveCells = 0;
  xMinus = x - 1;
  xPlus = x + 1;
  y--;
  numLiveCells = numLiveCells + (int)getGrid(grid, xMinus, y);
  numLiveCells = numLiveCells + (int)getGrid(grid, x, y);
  numLiveCells = numLiveCells + (int)getGrid(grid, xPlus, y);
  y++;
  numLiveCells = numLiveCells + (int)getGrid(grid, xMinus, y);
  numLiveCells = numLiveCells + (int)getGrid(grid, xPlus, y);
  y++;
  numLiveCells = numLiveCells + (int)getGrid(grid, xMinus, y);
  numLiveCells = numLiveCells + (int)getGrid(grid, x, y);
  numLiveCells = numLiveCells + (int)getGrid(grid, xPlus, y);
  y--;
  if (getGrid(grid, x, y)) {
    if (numLiveCells == 2 || numLiveCells == 3) {
      return true;
    } else {
      return false;
    }
  } else {
    if (numLiveCells == 3) {
      return true;
    }
  }
  return false;
}


void stepTime(void) {

  bool  result;

  gridToTemp();
  for (int y = 0; y < GRID_Y; y++) {
    for (int x = 0; x < GRID_X; x++) {
      result = updatePoint(x, y);
      setGrid(tempGrid, x, y, result);
    }
  }
}


void stepTime8() {
  byte *dst = (byte *) tempGrid;
  for (uint16_t y = 0; y < GRID_Y; y++) {
    for (uint16_t x = 0; x < GRID_X / 8; x++) {
      byte res = 0;
      for (uint8_t bitpos = 8; bitpos--;) {
        res <<= 1;
        res |= updatePoint(x * 8 + bitpos, y);
      }
      *dst++ = res;
    }
  }
}


void stepTime16() {
  uint16_t *dst = (uint16_t *) tempGrid;
  for (uint16_t y = 0; y < GRID_Y; y++) {
    for (uint16_t x = 0; x < GRID_X / 16; x++) {
      uint16_t res = 0;
      for (uint8_t bitpos = 16; bitpos--;) {
        res <<= 1;
        res |= updatePoint(x * 16 + bitpos, y);
      }
      *dst++ = res;
    }
  }
}


void stepTime32() {
  uint32_t *dst = (uint32_t *) tempGrid;
  for (uint16_t y = 0; y < GRID_Y; y++) {
    for (uint16_t x = 0; x < GRID_X / 32; x++) {
      uint32_t res = 0;
      for (uint8_t bitpos = 32; bitpos--;) {
        res <<= 1;
        res |= updatePoint(x * 32 + bitpos, y);
      }
      *dst++ = res;
    }
  }
}


void paintGrid(bool fullGrid) {

  bool  tempGridVal;

  if (fullGrid) {
    for (int y = 0; y < GRID_Y; y++) {
      for (int x = 0; x < GRID_X; x++) {
        if (getGrid(grid, x, y)) {
          whitePixel(x, y);
        } else {
          blackPixel(x, y);
        }
      }
    }
  } else {
    for (int y = 0; y < GRID_Y; y++) {
      for (int x = 0; x < GRID_X; x++) {
        tempGridVal = getGrid(tempGrid, x, y);
        // if (getGrid(grid, x, y) != tempGridVal) {  // 250ms -> 160ms improvement on AVR
          if (tempGridVal) {
            whitePixel(x, y);
          } else {
            blackPixel(x, y);
          }
        // }
      }
    }
    tempToGrid();
  }
}


void paintRasterGrid(byte src[][BYTES_X]) {
  byte *grid = (byte *) src;
  for (uint16_t y = 0; y < GRID_Y; y++) {
    for (uint16_t x = 0; x < GRID_X; x += 8) {
      byte gridval = *grid++;
      for (uint8_t bitpos = 0; bitpos < 8; bitpos++) {
        if (gridval & 1)
          whitePixel(x+bitpos, y);
        else
          blackPixel(x+bitpos, y);
        gridval >>= 1;
      }
    }
  }
}


#pragma GCC pop_options


void loop() {

  uint32_t t1 = millis();
  // stepTime();
  stepTime8();
  // stepTime16();
  // stepTime32();

  uint32_t t2 = millis();
  // paintGrid(false);
  paintRasterGrid(tempGrid);
  tempToGrid();

  uint32_t t3 = millis();

  // return;
  // print frames per second
  uint16_t sample_frames = 1;
  static uint16_t frame;
  static uint32_t last_ms;
  if (++frame == sample_frames) {
    frame = 0;
    Serial.print(t2 - t1);
    Serial.print(" ");
    Serial.print(t3 - t2);
    Serial.print(" ");
    Serial.print(sample_frames * 1000.0 / (millis() - last_ms));
    Serial.println(" FPS");
    last_ms = millis();
  }
}